diff --git a/docs/design/coreclr/botr/vectors-and-intrinsics.md b/docs/design/coreclr/botr/vectors-and-intrinsics.md
index 2fc93df7e8ee68..18f1ec07e145a2 100644
--- a/docs/design/coreclr/botr/vectors-and-intrinsics.md
+++ b/docs/design/coreclr/botr/vectors-and-intrinsics.md
@@ -40,8 +40,8 @@ For AOT compilation, the situation is far more complex. This is due to the follo
 ## Crossgen2 model of hardware intrinsic usage
 
 There are 2 sets of instruction sets known to the compiler.
-- The baseline instruction set which defaults to (Sse, Sse2), but may be adjusted via compiler option.
-- The optimistic instruction set which defaults to (Sse3, Ssse3, Sse41, Sse42, Popcnt, Pclmulqdq, and Lzcnt).
+- The baseline instruction set which defaults to x86-64-v2 (SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and POPCNT), but may be adjusted via compiler option.
+- The optimistic instruction set which defaults to (AES, GFNI, SHA, WAITPKG, and X86SERIALIZE).
 
 Code will be compiled using the optimistic instruction set to drive compilation, but any use of an instruction set beyond the baseline instruction set will be recorded, as will any attempt to use an instruction set beyond the optimistic set if that attempted use has a semantic effect. If the baseline instruction set includes `Avx2` then the size and characteristics of of `Vector<T>` is known. Any other decisions about ABI may also be encoded. For instance, it is likely that the ABI of `Vector256<T>` and `Vector512<T>` will vary based on the presence/absence of `Avx` support.
 
diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml
index d2c33c20f0a692..8f0356dec9c996 100644
--- a/eng/pipelines/common/templates/runtimes/run-test-job.yml
+++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml
@@ -359,74 +359,31 @@ jobs:
      - jitstress_random_2
    ${{ if in(parameters.testGroup, 'jitstress-isas-arm') }}:
      scenarios:
-      - jitstress_isas_incompletehwintrinsic
      - jitstress_isas_nohwintrinsic
-      - jitstress_isas_nohwintrinsic_nosimd
-      - jitstress_isas_nosimd
    ${{ if in(parameters.testGroup, 'jitstress-isas-x86') }}:
      scenarios:
-      - jitstress_isas_incompletehwintrinsic
      - jitstress_isas_nohwintrinsic
-      - jitstress_isas_nohwintrinsic_nosimd
-      - jitstress_isas_nosimd
      - jitstress_isas_x86_evex
-      - jitstress_isas_x86_noaes
      - jitstress_isas_x86_noavx
      - jitstress_isas_x86_noavx2
      - jitstress_isas_x86_noavx512
-      - jitstress_isas_x86_nobmi1
-      - jitstress_isas_x86_nobmi2
-      - jitstress_isas_x86_nofma
-      - jitstress_isas_x86_nohwintrinsic
-      - jitstress_isas_x86_nolzcnt
-      - jitstress_isas_x86_nopclmulqdq
-      - jitstress_isas_x86_nopopcnt
-      - jitstress_isas_x86_nosse
-      - jitstress_isas_x86_nosse2
-      - jitstress_isas_x86_nosse3
-      - jitstress_isas_x86_nosse3_4
-      - jitstress_isas_x86_nosse41
-      - jitstress_isas_x86_nosse42
-      - jitstress_isas_x86_nossse3
      - jitstress_isas_x86_vectort128
      - jitstress_isas_x86_vectort512
      - jitstress_isas_x86_noavx512_vectort128
-      - jitstress_isas_1_x86_noaes
+      - jitstress_isas_1_x86_evex
      - jitstress_isas_1_x86_noavx
      - jitstress_isas_1_x86_noavx2
      - jitstress_isas_1_x86_noavx512
-      - jitstress_isas_1_x86_nobmi1
-      - jitstress_isas_1_x86_nobmi2
-      - jitstress_isas_1_x86_nofma
-      - jitstress_isas_1_x86_nohwintrinsic
-      - jitstress_isas_1_x86_nolzcnt
-      - jitstress_isas_1_x86_nopclmulqdq
-      - jitstress_isas_1_x86_nopopcnt
-      - jitstress_isas_1_x86_nosse
-      - jitstress_isas_1_x86_nosse2
-      - jitstress_isas_1_x86_nosse3
-      - jitstress_isas_1_x86_nosse3_4
-      - jitstress_isas_1_x86_nosse41
-      - jitstress_isas_1_x86_nosse42
-      - 
jitstress_isas_1_x86_nossse3 - - jitstress_isas_2_x86_noaes + - jitstress_isas_1_x86_vectort128 + - jitstress_isas_1_x86_vectort512 + - jitstress_isas_1_x86_noavx512_vectort128 + - jitstress_isas_2_x86_evex - jitstress_isas_2_x86_noavx - jitstress_isas_2_x86_noavx2 - jitstress_isas_2_x86_noavx512 - - jitstress_isas_2_x86_nobmi1 - - jitstress_isas_2_x86_nobmi2 - - jitstress_isas_2_x86_nofma - - jitstress_isas_2_x86_nohwintrinsic - - jitstress_isas_2_x86_nolzcnt - - jitstress_isas_2_x86_nopclmulqdq - - jitstress_isas_2_x86_nopopcnt - - jitstress_isas_2_x86_nosse - - jitstress_isas_2_x86_nosse2 - - jitstress_isas_2_x86_nosse3 - - jitstress_isas_2_x86_nosse3_4 - - jitstress_isas_2_x86_nosse41 - - jitstress_isas_2_x86_nosse42 - - jitstress_isas_2_x86_nossse3 + - jitstress_isas_2_x86_vectort128 + - jitstress_isas_2_x86_vectort512 + - jitstress_isas_2_x86_noavx512_vectort128 ${{ if in(parameters.testGroup, 'jitstress-isas-avx512') }}: scenarios: - jitstress_isas_x86_evex diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 882a88839a933e..16cce0e76a3016 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -669,7 +669,6 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableHWIntrinsic, W("EnableHWIntri #endif // defined(TARGET_LOONGARCH64) #if defined(TARGET_AMD64) || defined(TARGET_X86) -RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE42, W("EnableSSE42"), 1, "Allows SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, and dependent hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX, W("EnableAVX"), 1, "Allows AVX and dependent hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX2, W("EnableAVX2"), 1, "Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512, W("EnableAVX512"), 1, "Allows AVX512 F+BW+CD+DQ+VL and depdendent hardware intrinsics to be disabled") diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h index 9d2f2aa9c482f6..7fe0c24d57b11d 100644 --- a/src/coreclr/inc/corinfoinstructionset.h +++ b/src/coreclr/inc/corinfoinstructionset.h @@ -51,101 +51,97 @@ enum CORINFO_InstructionSet #endif // TARGET_RISCV64 #ifdef TARGET_AMD64 InstructionSet_X86Base=1, - InstructionSet_SSE42=2, - InstructionSet_AVX=3, - InstructionSet_AVX2=4, - InstructionSet_AVX512=5, - InstructionSet_AVX512v2=6, - InstructionSet_AVX512v3=7, - InstructionSet_AVX10v1=8, - InstructionSet_AVX10v2=9, - InstructionSet_APX=10, - InstructionSet_AES=11, - InstructionSet_AES_V256=12, - InstructionSet_AES_V512=13, - InstructionSet_AVX512VP2INTERSECT=14, - InstructionSet_AVXIFMA=15, - InstructionSet_AVXVNNI=16, - InstructionSet_GFNI=17, - InstructionSet_GFNI_V256=18, - InstructionSet_GFNI_V512=19, - InstructionSet_SHA=20, - InstructionSet_WAITPKG=21, - InstructionSet_X86Serialize=22, - InstructionSet_Vector128=23, - InstructionSet_Vector256=24, - InstructionSet_Vector512=25, - InstructionSet_VectorT128=26, - InstructionSet_VectorT256=27, - InstructionSet_VectorT512=28, - InstructionSet_AVXVNNIINT=29, - InstructionSet_AVXVNNIINT_V512=30, - InstructionSet_X86Base_X64=31, - InstructionSet_SSE42_X64=32, - InstructionSet_AVX_X64=33, - InstructionSet_AVX2_X64=34, - InstructionSet_AVX512_X64=35, - InstructionSet_AVX512v2_X64=36, - InstructionSet_AVX512v3_X64=37, - InstructionSet_AVX10v1_X64=38, - InstructionSet_AVX10v2_X64=39, - InstructionSet_AES_X64=40, - InstructionSet_AVX512VP2INTERSECT_X64=41, - 
InstructionSet_AVXIFMA_X64=42, - InstructionSet_AVXVNNI_X64=43, - InstructionSet_GFNI_X64=44, - InstructionSet_SHA_X64=45, - InstructionSet_WAITPKG_X64=46, - InstructionSet_X86Serialize_X64=47, + InstructionSet_AVX=2, + InstructionSet_AVX2=3, + InstructionSet_AVX512=4, + InstructionSet_AVX512v2=5, + InstructionSet_AVX512v3=6, + InstructionSet_AVX10v1=7, + InstructionSet_AVX10v2=8, + InstructionSet_APX=9, + InstructionSet_AES=10, + InstructionSet_AES_V256=11, + InstructionSet_AES_V512=12, + InstructionSet_AVX512VP2INTERSECT=13, + InstructionSet_AVXIFMA=14, + InstructionSet_AVXVNNI=15, + InstructionSet_GFNI=16, + InstructionSet_GFNI_V256=17, + InstructionSet_GFNI_V512=18, + InstructionSet_SHA=19, + InstructionSet_WAITPKG=20, + InstructionSet_X86Serialize=21, + InstructionSet_Vector128=22, + InstructionSet_Vector256=23, + InstructionSet_Vector512=24, + InstructionSet_VectorT128=25, + InstructionSet_VectorT256=26, + InstructionSet_VectorT512=27, + InstructionSet_AVXVNNIINT=28, + InstructionSet_AVXVNNIINT_V512=29, + InstructionSet_X86Base_X64=30, + InstructionSet_AVX_X64=31, + InstructionSet_AVX2_X64=32, + InstructionSet_AVX512_X64=33, + InstructionSet_AVX512v2_X64=34, + InstructionSet_AVX512v3_X64=35, + InstructionSet_AVX10v1_X64=36, + InstructionSet_AVX10v2_X64=37, + InstructionSet_AES_X64=38, + InstructionSet_AVX512VP2INTERSECT_X64=39, + InstructionSet_AVXIFMA_X64=40, + InstructionSet_AVXVNNI_X64=41, + InstructionSet_GFNI_X64=42, + InstructionSet_SHA_X64=43, + InstructionSet_WAITPKG_X64=44, + InstructionSet_X86Serialize_X64=45, #endif // TARGET_AMD64 #ifdef TARGET_X86 InstructionSet_X86Base=1, - InstructionSet_SSE42=2, - InstructionSet_AVX=3, - InstructionSet_AVX2=4, - InstructionSet_AVX512=5, - InstructionSet_AVX512v2=6, - InstructionSet_AVX512v3=7, - InstructionSet_AVX10v1=8, - InstructionSet_AVX10v2=9, - InstructionSet_APX=10, - InstructionSet_AES=11, - InstructionSet_AES_V256=12, - InstructionSet_AES_V512=13, - InstructionSet_AVX512VP2INTERSECT=14, - InstructionSet_AVXIFMA=15, - InstructionSet_AVXVNNI=16, - InstructionSet_GFNI=17, - InstructionSet_GFNI_V256=18, - InstructionSet_GFNI_V512=19, - InstructionSet_SHA=20, - InstructionSet_WAITPKG=21, - InstructionSet_X86Serialize=22, - InstructionSet_Vector128=23, - InstructionSet_Vector256=24, - InstructionSet_Vector512=25, - InstructionSet_VectorT128=26, - InstructionSet_VectorT256=27, - InstructionSet_VectorT512=28, - InstructionSet_AVXVNNIINT=29, - InstructionSet_AVXVNNIINT_V512=30, - InstructionSet_X86Base_X64=31, - InstructionSet_SSE42_X64=32, - InstructionSet_AVX_X64=33, - InstructionSet_AVX2_X64=34, - InstructionSet_AVX512_X64=35, - InstructionSet_AVX512v2_X64=36, - InstructionSet_AVX512v3_X64=37, - InstructionSet_AVX10v1_X64=38, - InstructionSet_AVX10v2_X64=39, - InstructionSet_AES_X64=40, - InstructionSet_AVX512VP2INTERSECT_X64=41, - InstructionSet_AVXIFMA_X64=42, - InstructionSet_AVXVNNI_X64=43, - InstructionSet_GFNI_X64=44, - InstructionSet_SHA_X64=45, - InstructionSet_WAITPKG_X64=46, - InstructionSet_X86Serialize_X64=47, + InstructionSet_AVX=2, + InstructionSet_AVX2=3, + InstructionSet_AVX512=4, + InstructionSet_AVX512v2=5, + InstructionSet_AVX512v3=6, + InstructionSet_AVX10v1=7, + InstructionSet_AVX10v2=8, + InstructionSet_APX=9, + InstructionSet_AES=10, + InstructionSet_AES_V256=11, + InstructionSet_AES_V512=12, + InstructionSet_AVX512VP2INTERSECT=13, + InstructionSet_AVXIFMA=14, + InstructionSet_AVXVNNI=15, + InstructionSet_GFNI=16, + InstructionSet_GFNI_V256=17, + InstructionSet_GFNI_V512=18, + InstructionSet_SHA=19, + 
InstructionSet_WAITPKG=20, + InstructionSet_X86Serialize=21, + InstructionSet_Vector128=22, + InstructionSet_Vector256=23, + InstructionSet_Vector512=24, + InstructionSet_VectorT128=25, + InstructionSet_VectorT256=26, + InstructionSet_VectorT512=27, + InstructionSet_AVXVNNIINT=28, + InstructionSet_AVXVNNIINT_V512=29, + InstructionSet_X86Base_X64=30, + InstructionSet_AVX_X64=31, + InstructionSet_AVX2_X64=32, + InstructionSet_AVX512_X64=33, + InstructionSet_AVX512v2_X64=34, + InstructionSet_AVX512v3_X64=35, + InstructionSet_AVX10v1_X64=36, + InstructionSet_AVX10v2_X64=37, + InstructionSet_AES_X64=38, + InstructionSet_AVX512VP2INTERSECT_X64=39, + InstructionSet_AVXIFMA_X64=40, + InstructionSet_AVXVNNI_X64=41, + InstructionSet_GFNI_X64=42, + InstructionSet_SHA_X64=43, + InstructionSet_WAITPKG_X64=44, + InstructionSet_X86Serialize_X64=45, #endif // TARGET_X86 }; @@ -267,8 +263,6 @@ struct CORINFO_InstructionSetFlags #ifdef TARGET_AMD64 if (HasInstructionSet(InstructionSet_X86Base)) AddInstructionSet(InstructionSet_X86Base_X64); - if (HasInstructionSet(InstructionSet_SSE42)) - AddInstructionSet(InstructionSet_SSE42_X64); if (HasInstructionSet(InstructionSet_AVX)) AddInstructionSet(InstructionSet_AVX_X64); if (HasInstructionSet(InstructionSet_AVX2)) @@ -395,10 +389,6 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_X86Base); if (resultflags.HasInstructionSet(InstructionSet_X86Base_X64) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) resultflags.RemoveInstructionSet(InstructionSet_X86Base_X64); - if (resultflags.HasInstructionSet(InstructionSet_SSE42) && !resultflags.HasInstructionSet(InstructionSet_SSE42_X64)) - resultflags.RemoveInstructionSet(InstructionSet_SSE42); - if (resultflags.HasInstructionSet(InstructionSet_SSE42_X64) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) - resultflags.RemoveInstructionSet(InstructionSet_SSE42_X64); if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_AVX_X64)) resultflags.RemoveInstructionSet(InstructionSet_AVX); if (resultflags.HasInstructionSet(InstructionSet_AVX_X64) && !resultflags.HasInstructionSet(InstructionSet_AVX)) @@ -459,9 +449,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_X86Serialize); if (resultflags.HasInstructionSet(InstructionSet_X86Serialize_X64) && !resultflags.HasInstructionSet(InstructionSet_X86Serialize)) resultflags.RemoveInstructionSet(InstructionSet_X86Serialize_X64); - if (resultflags.HasInstructionSet(InstructionSet_SSE42) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) - resultflags.RemoveInstructionSet(InstructionSet_SSE42); - if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) + if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) resultflags.RemoveInstructionSet(InstructionSet_AVX); if (resultflags.HasInstructionSet(InstructionSet_AVX2) && !resultflags.HasInstructionSet(InstructionSet_AVX)) resultflags.RemoveInstructionSet(InstructionSet_AVX2); @@ -491,7 +479,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_AVXIFMA); if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2)) 
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI); - if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) + if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) resultflags.RemoveInstructionSet(InstructionSet_GFNI); if (resultflags.HasInstructionSet(InstructionSet_GFNI_V256) && !resultflags.HasInstructionSet(InstructionSet_GFNI)) resultflags.RemoveInstructionSet(InstructionSet_GFNI_V256); @@ -525,9 +513,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_VectorT512); #endif // TARGET_AMD64 #ifdef TARGET_X86 - if (resultflags.HasInstructionSet(InstructionSet_SSE42) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) - resultflags.RemoveInstructionSet(InstructionSet_SSE42); - if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) + if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) resultflags.RemoveInstructionSet(InstructionSet_AVX); if (resultflags.HasInstructionSet(InstructionSet_AVX2) && !resultflags.HasInstructionSet(InstructionSet_AVX)) resultflags.RemoveInstructionSet(InstructionSet_AVX2); @@ -557,7 +543,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_AVXIFMA); if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2)) resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI); - if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) + if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) resultflags.RemoveInstructionSet(InstructionSet_GFNI); if (resultflags.HasInstructionSet(InstructionSet_GFNI_V256) && !resultflags.HasInstructionSet(InstructionSet_GFNI)) resultflags.RemoveInstructionSet(InstructionSet_GFNI_V256); @@ -673,10 +659,6 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet) return "X86Base"; case InstructionSet_X86Base_X64 : return "X86Base_X64"; - case InstructionSet_SSE42 : - return "SSE42"; - case InstructionSet_SSE42_X64 : - return "SSE42_X64"; case InstructionSet_AVX : return "AVX"; case InstructionSet_AVX_X64 : @@ -767,8 +749,6 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet) #ifdef TARGET_X86 case InstructionSet_X86Base : return "X86Base"; - case InstructionSet_SSE42 : - return "SSE42"; case InstructionSet_AVX : return "AVX"; case InstructionSet_AVX2 : @@ -869,11 +849,11 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base; case READYTORUN_INSTRUCTION_Sse: return InstructionSet_X86Base; case READYTORUN_INSTRUCTION_Sse2: return InstructionSet_X86Base; - case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_SSE42; + case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_X86Base; + case 
READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_X86Base; case READYTORUN_INSTRUCTION_Avx: return InstructionSet_AVX; case READYTORUN_INSTRUCTION_Avx2: return InstructionSet_AVX2; case READYTORUN_INSTRUCTION_Bmi1: return InstructionSet_AVX2; @@ -938,11 +918,11 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base; case READYTORUN_INSTRUCTION_Sse: return InstructionSet_X86Base; case READYTORUN_INSTRUCTION_Sse2: return InstructionSet_X86Base; - case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_SSE42; - case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_SSE42; + case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_X86Base; + case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_X86Base; case READYTORUN_INSTRUCTION_Avx: return InstructionSet_AVX; case READYTORUN_INSTRUCTION_Avx2: return InstructionSet_AVX2; case READYTORUN_INSTRUCTION_Bmi1: return InstructionSet_AVX2; diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 203f13bb671239..6b67bc26ee3b9b 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -37,11 +37,11 @@ #include -constexpr GUID JITEEVersionIdentifier = { /* 7a8cbc56-9e19-4321-80b9-a0d2c578c945 */ - 0x7a8cbc56, - 0x9e19, - 0x4321, - {0x80, 0xb9, 0xa0, 0xd2, 0xc5, 0x78, 0xc9, 0x45} +constexpr GUID JITEEVersionIdentifier = { /* 4c03a921-f305-47db-a9bb-c7ec4a1b83d8 */ + 0x4c03a921, + 0xf305, + 0x47db, + {0xa9, 0xbb, 0xc7, 0xec, 0x4a, 0x1b, 0x83, 0xd8} }; #endif // JIT_EE_VERSIONING_GUID_H diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index c30f63f7be3281..8ac3bc1add5d2a 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -251,17 +251,16 @@ bool IntegralRange::Contains(int64_t value) const case NI_X86Base_CompareScalarUnorderedLessThan: case NI_X86Base_CompareScalarUnorderedGreaterThanOrEqual: case NI_X86Base_CompareScalarUnorderedGreaterThan: - case NI_SSE42_TestC: - case NI_SSE42_TestZ: - case NI_SSE42_TestNotZAndNotC: + case NI_X86Base_TestC: + case NI_X86Base_TestZ: + case NI_X86Base_TestNotZAndNotC: case NI_AVX_TestC: case NI_AVX_TestZ: case NI_AVX_TestNotZAndNotC: return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::One}; case NI_X86Base_Extract: - case NI_SSE42_Extract: - case NI_SSE42_X64_Extract: + case NI_X86Base_X64_Extract: case NI_Vector128_ToScalar: case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: @@ -278,8 +277,8 @@ bool IntegralRange::Contains(int64_t value) const case NI_AVX2_TrailingZeroCount: case NI_AVX2_X64_LeadingZeroCount: case NI_AVX2_X64_TrailingZeroCount: - case NI_SSE42_PopCount: - case NI_SSE42_X64_PopCount: + case NI_X86Base_PopCount: + case NI_X86Base_X64_PopCount: // Note: No advantage in using a precise range for IntegralRange. // Example: IntCns = 42 gives [0..127] with a non -precise range, [42,42] with a precise range. 
return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::ByteMax}; diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 2a2c7a6b8656fd..6fadbfa9aede89 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -47,11 +47,11 @@ class CodeGen final : public CodeGenInterface private: #if defined(TARGET_XARCH) - // Generates SSE2 code for the given tree as "Operand BitWiseOp BitMask" - void genSSE2BitwiseOp(GenTree* treeNode); + // Generates intrinsic code for the given tree as "Operand BitWiseOp BitMask" + void genIntrinsicBitwiseOp(GenTree* treeNode); - // Generates SSE42 code for the given tree as a round operation - void genSSE42RoundOp(GenTreeOp* treeNode); + // Generates intrinsic code for the given tree as a round operation + void genIntrinsicRoundOp(GenTreeOp* treeNode); instruction simdAlignedMovIns() { @@ -942,7 +942,6 @@ class CodeGen final : public CodeGenInterface void genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); - void genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void genFmaIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions); void genPermuteVar2x(GenTreeHWIntrinsic* node, insOpts instOptions); diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index bf179416c404c4..0174916bce4476 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1561,9 +1561,10 @@ instruction CodeGen::genGetVolatileLdStIns(instruction currentIns, const bool addrIsInReg = indir->Addr()->isUsedFromReg(); // With RCPC2 (arm64 v8.4+) we can work with simple addressing modes like [reg + simm9] - const bool shouldUseRcpc2 = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc2) && !addrIsInReg && - indir->Addr()->OperIs(GT_LEA) && !indir->HasIndex() && (indir->Scale() == 1) && - emitter::emitIns_valid_imm_for_unscaled_ldst_offset(indir->Offset()); + const bool shouldUseRcpc2 = !addrIsInReg && indir->Addr()->OperIs(GT_LEA) && !indir->HasIndex() && + (indir->Scale() == 1) && + emitter::emitIns_valid_imm_for_unscaled_ldst_offset(indir->Offset()) && + compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc2); if (shouldUseRcpc2) { diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index af4031d3520511..a891a6bc68229b 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -707,7 +707,7 @@ void CodeGen::genCodeForNegNot(GenTree* tree) if (varTypeIsFloating(targetType)) { assert(tree->OperIs(GT_NEG)); - genSSE2BitwiseOp(tree); + genIntrinsicBitwiseOp(tree); } else { @@ -1430,18 +1430,7 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, const ReturnTypeDesc* retTypeDesc inst_Mov(TYP_INT, reg0, opReg, /* canSkip */ false); // reg1 = opRef[61:32] - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1, INS_OPTS_NONE); - } - else - { - bool isRMW = !compiler->canUseVexEncoding(); - int8_t shuffleMask = 1; // we only need [61:32]->[31:0], the rest is not read. 
- - inst_RV_RV_TT_IV(INS_pshufd, EA_8BYTE, opReg, opReg, src, shuffleMask, isRMW, INS_OPTS_NONE); - inst_Mov(TYP_INT, reg1, opReg, /* canSkip */ false); - } + inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1, INS_OPTS_NONE); #endif // TARGET_X86 } @@ -2457,17 +2446,7 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode) inst_Mov(TYP_FLOAT, targetReg, reg0, /* canSkip */ false); const emitAttr size = emitTypeSize(TYP_SIMD8); - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - GetEmitter()->emitIns_SIMD_R_R_R_I(INS_pinsrd, size, targetReg, targetReg, reg1, 1, INS_OPTS_NONE); - } - else - { - regNumber tempXmm = internalRegisters.GetSingle(lclNode); - assert(tempXmm != targetReg); - inst_Mov(TYP_FLOAT, tempXmm, reg1, /* canSkip */ false); - GetEmitter()->emitIns_SIMD_R_R_R(INS_punpckldq, size, targetReg, targetReg, tempXmm, INS_OPTS_NONE); - } + GetEmitter()->emitIns_SIMD_R_R_R_I(INS_pinsrd, size, targetReg, targetReg, reg1, 1, INS_OPTS_NONE); genProduceReg(lclNode); } #elif defined(TARGET_AMD64) @@ -5788,8 +5767,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) } case NI_X86Base_Extract: - case NI_SSE42_Extract: - case NI_SSE42_X64_Extract: + case NI_X86Base_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: case NI_AVX512_ExtractVector128: @@ -5805,15 +5783,6 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) switch (ins) { - case INS_pextrw: - { - // The encoding which supports containment is SSE4.1+ only - assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE42)); - - ins = INS_pextrw_sse42; - break; - } - case INS_vextractf64x2: { ins = INS_vextractf32x4; @@ -7740,7 +7709,7 @@ int CodeGenInterface::genCallerSPtoInitialSPdelta() const #endif // TARGET_AMD64 //----------------------------------------------------------------------------------------- -// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask" +// genIntrinsicBitwiseOp - generate intrinsic code for the given oper as "Operand BitWiseOp BitMask" // // Arguments: // treeNode - tree node @@ -7752,7 +7721,7 @@ int CodeGenInterface::genCallerSPtoInitialSPdelta() const // i) tree oper is one of GT_NEG or GT_INTRINSIC Abs() // ii) tree type is floating point type. 
// iii) caller of this routine needs to call genProduceReg() -void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) +void CodeGen::genIntrinsicBitwiseOp(GenTree* treeNode) { regNumber targetReg = treeNode->GetRegNum(); regNumber operandReg = genConsumeReg(treeNode->gtGetOp1()); @@ -7783,7 +7752,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) } else { - assert(!"genSSE2BitwiseOp: unsupported oper"); + assert(!"genIntrinsicBitwiseOp: unsupported oper"); } simd16_t constValue; @@ -7799,7 +7768,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) } //----------------------------------------------------------------------------------------- -// genSSE42RoundOp - generate SSE42 code for the given tree as a round operation +// genIntrinsicRoundOp - generate intrinsic code for the given tree as a round operation // // Arguments: // treeNode - tree node @@ -7808,17 +7777,13 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) // None // // Assumptions: -// i) SSE4.2 is supported by the underlying hardware -// ii) treeNode oper is a GT_INTRINSIC -// iii) treeNode type is a floating point type -// iv) treeNode is not used from memory -// v) tree oper is NI_System_Math{F}_Round, _Ceiling, _Floor, or _Truncate -// vi) caller of this routine needs to call genProduceReg() -void CodeGen::genSSE42RoundOp(GenTreeOp* treeNode) +// i) treeNode oper is a GT_INTRINSIC +// ii) treeNode type is a floating point type +// iii) treeNode is not used from memory +// iv) tree oper is NI_System_Math{F}_Round, _Ceiling, _Floor, or _Truncate +// v) caller of this routine needs to call genProduceReg() +void CodeGen::genIntrinsicRoundOp(GenTreeOp* treeNode) { - // i) SSE4.2 is supported by the underlying hardware - assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE42)); - // ii) treeNode oper is a GT_INTRINSIC assert(treeNode->OperIs(GT_INTRINSIC)); @@ -7861,7 +7826,7 @@ void CodeGen::genSSE42RoundOp(GenTreeOp* treeNode) default: ins = INS_invalid; - assert(!"genSSE42RoundOp: unsupported intrinsic"); + assert(!"genRoundOp: unsupported intrinsic"); unreached(); } @@ -7884,14 +7849,14 @@ void CodeGen::genIntrinsic(GenTreeIntrinsic* treeNode) switch (treeNode->gtIntrinsicName) { case NI_System_Math_Abs: - genSSE2BitwiseOp(treeNode); + genIntrinsicBitwiseOp(treeNode); break; case NI_System_Math_Ceiling: case NI_System_Math_Floor: case NI_System_Math_Truncate: case NI_System_Math_Round: - genSSE42RoundOp(treeNode->AsOp()); + genIntrinsicRoundOp(treeNode->AsOp()); break; case NI_System_Math_Sqrt: diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 4680ba9eda5227..56b7ac6a82552f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -6061,11 +6061,6 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr, instructionSetFlags.AddInstructionSet(InstructionSet_X86Base); - if (JitConfig.EnableSSE42() != 0) - { - instructionSetFlags.AddInstructionSet(InstructionSet_SSE42); - } - if (JitConfig.EnableAVX() != 0) { instructionSetFlags.AddInstructionSet(InstructionSet_AVX); diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index c67c62e6d38f03..70767278e390e1 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -1960,24 +1960,10 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIn simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet()); Range().InsertAfter(loResult, simdTmpVar); - GenTree* hiResult; - if 
(m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - GenTree* one = m_compiler->gtNewIconNode(1); - hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize); - - Range().InsertAfter(simdTmpVar, one, hiResult); - } - else - { - GenTree* thirtyTwo = m_compiler->gtNewIconNode(32); - GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo, - node->GetSimdBaseJitType(), simdSize); - hiResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, shift, CORINFO_TYPE_INT, simdSize); - - Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult); - } + GenTree* one = m_compiler->gtNewIconNode(1); + GenTree* hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize); + Range().InsertAfter(simdTmpVar, one, hiResult); Range().Remove(node); return FinalizeDecomposition(use, loResult, hiResult, hiResult); diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 416b3e978345ef..5e2ee7592ddda2 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8292,12 +8292,8 @@ void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, reg if ((dataSize == 16) && (constValue->u64[1] == constValue->u64[0])) { - if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) || - emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX)) - { - dataSize = 8; - ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd; - } + dataSize = 8; + ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd; } // `vbroadcastss` fills the full SIMD register, so we can't do this last step if the diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 30b5ea482838b8..d8cdd203ee8a2c 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -4024,7 +4024,6 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) case INS_pextrd: case INS_pextrq: case INS_pextrw: - case INS_pextrw_sse42: case INS_rorx: case INS_shlx: case INS_sarx: @@ -7003,35 +7002,8 @@ void emitter::emitStoreSimd12ToLclOffset(unsigned varNum, unsigned offset, regNu // Store lower 8 bytes emitIns_S_R(INS_movsd_simd, EA_8BYTE, dataReg, varNum, offset); - if (emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // Extract and store upper 4 bytes - emitIns_S_R_I(INS_extractps, EA_16BYTE, varNum, offset + 8, dataReg, 2); - } - else if (tmpRegProvider != nullptr) - { - regNumber tmpReg = codeGen->internalRegisters.GetSingle(tmpRegProvider); - assert(isFloatReg(tmpReg)); - - // Extract upper 4 bytes from data - emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg); - - // Store upper 4 bytes - emitIns_S_R(INS_movss, EA_4BYTE, tmpReg, varNum, offset + 8); - } - else - { - // We don't have temp regs - let's do two shuffles then - - // [0,1,2,3] -> [2,3,0,1] - emitIns_R_R_I(INS_pshufd, EA_16BYTE, dataReg, dataReg, 78); - - // Store upper 4 bytes - emitIns_S_R(INS_movss, EA_4BYTE, dataReg, varNum, offset + 8); - - // Restore dataReg to its previous state: [2,3,0,1] -> [0,1,2,3] - emitIns_R_R_I(INS_pshufd, EA_16BYTE, dataReg, dataReg, 78); - } + // Extract and store upper 4 bytes + emitIns_S_R_I(INS_extractps, EA_16BYTE, varNum, offset + 8, dataReg, 2); } #endif // FEATURE_SIMD @@ -13628,7 +13600,6 @@ void emitter::emitDispIns( case INS_extractps: case INS_pextrb: case INS_pextrw: - case INS_pextrw_sse42: case INS_pextrd: { tgtAttr = EA_4BYTE; diff --git a/src/coreclr/jit/fgbasic.cpp 
b/src/coreclr/jit/fgbasic.cpp index 02bb832c24a4f2..fb03753cf2a51f 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -1182,8 +1182,8 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed case NI_AVX2_TrailingZeroCount: case NI_AVX2_X64_LeadingZeroCount: case NI_AVX2_X64_TrailingZeroCount: - case NI_SSE42_PopCount: - case NI_SSE42_X64_PopCount: + case NI_X86Base_PopCount: + case NI_X86Base_X64_PopCount: case NI_Vector256_Create: case NI_Vector512_Create: case NI_Vector256_CreateScalar: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2ae5c17f1132f7..10bde780a8534f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -18249,12 +18249,6 @@ bool Compiler::IsValidForShuffle( { assert(simdSize == 16); - if (varTypeIsSmall(simdBaseType) && (!compOpportunisticallyDependsOn(InstructionSet_SSE42))) - { - // TYP_BYTE, TYP_UBYTE, TYP_SHORT, and TYP_USHORT need SSSE3 to be able to shuffle any operation - return false; - } - bool isVariableShuffle = !indices->IsCnsVec(); if ((!isVariableShuffle) && isShuffleNative) { @@ -18269,16 +18263,6 @@ bool Compiler::IsValidForShuffle( } } } - if (isVariableShuffle && (!compOpportunisticallyDependsOn(InstructionSet_SSE42))) - { - // the variable implementation for Vector128 Shuffle always needs SSSE3 - // however, this can become valid later if it becomes constant - if (canBecomeValid != nullptr) - { - *canBecomeValid = true; - } - return false; - } } #endif // TARGET_XARCH @@ -20115,6 +20099,11 @@ bool GenTree::isCommutativeHWIntrinsic() const switch (id) { #ifdef TARGET_XARCH + case NI_X86Base_MultiplyAddAdjacent: + { + return !varTypeIsShort(node->GetSimdBaseType()); + } + case NI_X86Base_Max: case NI_X86Base_Min: { @@ -20187,11 +20176,10 @@ bool GenTree::isContainableHWIntrinsic() const case NI_Vector512_ToScalar: case NI_X86Base_ConvertToInt32: case NI_X86Base_ConvertToUInt32: + case NI_X86Base_Extract: case NI_X86Base_X64_ConvertToInt64: case NI_X86Base_X64_ConvertToUInt64: - case NI_X86Base_Extract: - case NI_SSE42_Extract: - case NI_SSE42_X64_Extract: + case NI_X86Base_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ConvertToInt32: case NI_AVX2_ConvertToUInt32: @@ -20234,8 +20222,8 @@ bool GenTree::isContainableHWIntrinsic() const return true; } - case NI_SSE42_LoadAndDuplicateToVector128: - case NI_SSE42_MoveAndDuplicate: + case NI_X86Base_LoadAndDuplicateToVector128: + case NI_X86Base_MoveAndDuplicate: case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: @@ -20819,8 +20807,6 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si if (varTypeIsFloating(simdBaseType)) { // Abs(v) = v & ~new vector(-0.0); - assert((simdSize != 32) || compIsaSupportedDebugOnly(InstructionSet_AVX)); - GenTree* bitMask; if (simdBaseType == TYP_FLOAT) @@ -20838,30 +20824,21 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si NamedIntrinsic intrinsic = NI_Illegal; - if (simdBaseType == TYP_LONG) + if ((simdSize == 64) || (simdBaseType == TYP_LONG)) { if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) { intrinsic = NI_AVX512_Abs; } - else - { - assert(simdSize != 64); - } } else if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); intrinsic = NI_AVX2_Abs; } - else if (simdSize == 64) - { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); - intrinsic = NI_AVX512_Abs; - } - else if 
(compOpportunisticallyDependsOn(InstructionSet_SSE42)) + else { - intrinsic = NI_SSE42_Abs; + intrinsic = NI_X86Base_Abs; } if (intrinsic != NI_Illegal) @@ -21197,9 +21174,7 @@ GenTree* Compiler::gtNewSimdBinOpNode( { if (simdBaseType == TYP_INT) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX) || - compIsaSupportedDebugOnly(InstructionSet_AVX512)); - + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(simdSize == 16 || simdSize == 32); NamedIntrinsic divIntrinsic = simdSize == 16 ? NI_Vector128_op_Division : NI_Vector256_op_Division; @@ -21218,8 +21193,6 @@ GenTree* Compiler::gtNewSimdBinOpNode( #if defined(TARGET_XARCH) if (varTypeIsByte(simdBaseType)) { - assert((simdSize != 64) || compIsaSupportedDebugOnly(InstructionSet_AVX512)); - CorInfoType widenedSimdBaseJitType; NamedIntrinsic widenIntrinsic; NamedIntrinsic narrowIntrinsic; @@ -21551,19 +21524,16 @@ GenTree* Compiler::gtNewSimdCeilNode(var_types type, GenTree* op1, CorInfoType s #if defined(TARGET_XARCH) if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_Ceiling; } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); GenTree* op2 = gtNewIconNode(static_cast(FloatRoundingMode::ToPositiveInfinity)); return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512_RoundScale, simdBaseJitType, simdSize); } else { - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - intrinsic = NI_SSE42_Ceiling; + intrinsic = NI_X86Base_Ceiling; } #elif defined(TARGET_ARM64) if (simdBaseType == TYP_DOUBLE) @@ -21631,9 +21601,7 @@ GenTree* Compiler::gtNewSimdCvtNode(var_types type, assert(varTypeIsIntegral(simdTargetBaseType)); #if defined(TARGET_XARCH) - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512) || - ((simdTargetBaseType == TYP_INT) && ((simdSize == 16 && compIsaSupportedDebugOnly(InstructionSet_SSE42)) || - (simdSize == 32 && compIsaSupportedDebugOnly(InstructionSet_AVX))))); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512) || (simdTargetBaseType == TYP_INT)); GenTree* fixupVal; @@ -21772,9 +21740,7 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, NamedIntrinsic hwIntrinsicID = NI_Illegal; #if defined(TARGET_XARCH) - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512) || - ((simdTargetBaseType == TYP_INT) && - ((simdSize == 16) || (simdSize == 32 && compIsaSupportedDebugOnly(InstructionSet_AVX))))); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512) || (simdTargetBaseType == TYP_INT)); switch (simdSourceBaseJitType) { @@ -22086,8 +22052,7 @@ GenTree* Compiler::gtNewSimdCmpOpNode( // NOTE: technically, we can special case byte type to only require SSE2, but it // complicates the test matrix for little gains. 
- if (((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2)) || - ((simdSize == 16) && compOpportunisticallyDependsOn(InstructionSet_SSE42))) + if ((simdSize == 16) || ((simdSize == 32) && compOpportunisticallyDependsOn(InstructionSet_AVX2))) { // TODO-AVX512: We can use this trick for longs only with AVX-512 if (!varTypeIsLong(simdBaseType)) @@ -22325,14 +22290,11 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( { if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - intrinsic = NI_Vector256_op_Equality; } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_Vector512_op_Equality; } else @@ -22361,7 +22323,6 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_Vector512_op_Equality; } else @@ -22480,7 +22441,6 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode( } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_Vector512_op_Inequality; } else @@ -22508,12 +22468,10 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode( { if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_Vector512_op_Inequality; } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); intrinsic = NI_Vector256_op_Inequality; @@ -22599,7 +22557,6 @@ GenTree* Compiler::gtNewSimdCndSelNode( } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_Vector256_ConditionalSelect; } else @@ -23197,8 +23154,6 @@ GenTree* Compiler::gtNewSimdDotProdNode( } else { - assert(((simdBaseType != TYP_INT) && (simdBaseType != TYP_UINT)) || - compIsaSupportedDebugOnly(InstructionSet_SSE42)); intrinsic = NI_Vector128_Dot; } #elif defined(TARGET_ARM64) @@ -23232,14 +23187,12 @@ GenTree* Compiler::gtNewSimdFloorNode(var_types type, GenTree* op1, CorInfoType } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); GenTree* op2 = gtNewIconNode(static_cast(FloatRoundingMode::ToNegativeInfinity)); return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512_RoundScale, simdBaseJitType, simdSize); } else { - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - intrinsic = NI_SSE42_Floor; + intrinsic = NI_X86Base_Floor; } #elif defined(TARGET_ARM64) if (simdBaseType == TYP_DOUBLE) @@ -23281,7 +23234,6 @@ GenTree* Compiler::gtNewSimdFmaNode( #if defined(TARGET_XARCH) if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_AVX512_FusedMultiplyAdd; } else @@ -23325,35 +23277,6 @@ GenTree* Compiler::gtNewSimdGetElementNode( return gtNewSimdToScalarNode(type, op1, simdBaseJitType, simdSize); } - switch (simdBaseType) - { - case TYP_BYTE: - case TYP_UBYTE: - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - // Using software fallback if simdBaseType is not supported by hardware - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - break; - } - - case TYP_DOUBLE: - case TYP_FLOAT: - case TYP_SHORT: - case TYP_USHORT: - { - // Supported by baseline ISA requirement - break; - } - - default: - { - unreached(); - } - } - if (simdSize == 64) { intrinsicId = NI_Vector512_GetElement; @@ -24136,12 +24059,10 @@ GenTree* 
Compiler::gtNewSimdLoadAlignedNode(var_types type, if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_AVX512_LoadAlignedVector512; } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_LoadAlignedVector256; } else @@ -24204,24 +24125,18 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode(var_types type, } else { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_LoadAlignedVector256; } } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_AVX512_LoadAlignedVector512NonTemporal; isNonTemporal = true; } - else if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - intrinsic = NI_SSE42_LoadAlignedVector128NonTemporal; - isNonTemporal = true; - } else { - intrinsic = NI_X86Base_LoadAlignedVector128; + intrinsic = NI_X86Base_LoadAlignedVector128NonTemporal; + isNonTemporal = true; } if (isNonTemporal) @@ -24296,7 +24211,7 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, assert(varTypeIsFloating(type)); assert(simdBaseType == type); } - else + else if (!varTypeIsLong(simdBaseType)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -24949,8 +24864,6 @@ GenTree* Compiler::gtNewSimdMinMaxNativeNode( #if defined(TARGET_XARCH) if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); - if (varTypeIsFloating(simdBaseType)) { intrinsic = isMax ? NI_AVX_Max : NI_AVX_Min; @@ -24971,135 +24884,28 @@ GenTree* Compiler::gtNewSimdMinMaxNativeNode( } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = isMax ? NI_AVX512_Max : NI_AVX512_Min; } - else + else if (!varTypeIsLong(simdBaseType)) { - switch (simdBaseType) + if (isScalar) { - case TYP_BYTE: - case TYP_USHORT: - { - if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - intrinsic = isMax ? 
NI_SSE42_Max : NI_SSE42_Min; - break; - } - - GenTree* constVal = nullptr; - CorInfoType opJitType = simdBaseJitType; - var_types opType = simdBaseType; - genTreeOps fixupOp1 = GT_NONE; - genTreeOps fixupOp2 = GT_NONE; - - switch (simdBaseType) - { - case TYP_BYTE: - { - constVal = gtNewIconNode(0x80808080); - fixupOp1 = GT_SUB; - fixupOp2 = GT_ADD; - simdBaseJitType = CORINFO_TYPE_UBYTE; - simdBaseType = TYP_UBYTE; - break; - } - - case TYP_USHORT: - { - constVal = gtNewIconNode(0x80008000); - fixupOp1 = GT_ADD; - fixupOp2 = GT_SUB; - simdBaseJitType = CORINFO_TYPE_SHORT; - simdBaseType = TYP_SHORT; - break; - } - - default: - { - unreached(); - } - } - - assert(constVal != nullptr); - assert(fixupOp1 != GT_NONE); - assert(fixupOp2 != GT_NONE); - assert(opJitType != simdBaseJitType); - assert(opType != simdBaseType); - - GenTree* constVector = gtNewSimdCreateBroadcastNode(type, constVal, CORINFO_TYPE_INT, simdSize); - - GenTree* constVectorDup1 = fgMakeMultiUse(&constVector); - GenTree* constVectorDup2 = gtCloneExpr(constVectorDup1); - - // op1 = op1 - constVector - // -or- - // op1 = op1 + constVector - op1 = gtNewSimdBinOpNode(fixupOp1, type, op1, constVector, opJitType, simdSize); - - // op2 = op2 - constVectorDup1 - // -or- - // op2 = op2 + constVectorDup1 - op2 = gtNewSimdBinOpNode(fixupOp1, type, op2, constVectorDup1, opJitType, simdSize); - - // op1 = Min(op1, op2) - // -or- - // op1 = Max(op1, op2) - op1 = gtNewSimdMinMaxNativeNode(type, op1, op2, simdBaseJitType, simdSize, isMax); - - // result = op1 + constVectorDup2 - // -or- - // result = op1 - constVectorDup2 - return gtNewSimdBinOpNode(fixupOp2, type, op1, constVectorDup2, opJitType, simdSize); - } - - case TYP_INT: - case TYP_UINT: - { - if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - intrinsic = isMax ? NI_SSE42_Max : NI_SSE42_Min; - } - break; - } - - case TYP_LONG: - case TYP_ULONG: - { - if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) - { - intrinsic = isMax ? NI_AVX512_Max : NI_AVX512_Min; - } - break; - } - - case TYP_FLOAT: - case TYP_UBYTE: - case TYP_SHORT: - case TYP_DOUBLE: - { - if (isScalar) - { - simdSize = 16; - type = TYP_SIMD16; - - op1 = gtNewSimdCreateScalarUnsafeNode(type, op1, simdBaseJitType, simdSize); - op2 = gtNewSimdCreateScalarUnsafeNode(type, op2, simdBaseJitType, simdSize); + simdSize = 16; + type = TYP_SIMD16; - intrinsic = isMax ? NI_X86Base_MaxScalar : NI_X86Base_MinScalar; - } - else - { - intrinsic = isMax ? NI_X86Base_Max : NI_X86Base_Min; - } - break; - } + op1 = gtNewSimdCreateScalarUnsafeNode(type, op1, simdBaseJitType, simdSize); + op2 = gtNewSimdCreateScalarUnsafeNode(type, op2, simdBaseJitType, simdSize); - default: - { - unreached(); - } + intrinsic = isMax ? NI_X86Base_MaxScalar : NI_X86Base_MinScalar; } + else + { + intrinsic = isMax ? NI_X86Base_Max : NI_X86Base_Min; + } + } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) + { + intrinsic = isMax ? 
NI_AVX512_Max : NI_AVX512_Min; } #elif defined(TARGET_ARM64) if (!varTypeIsLong(simdBaseType)) @@ -25179,7 +24985,6 @@ GenTree* Compiler::gtNewSimdNarrowNode( #if defined(TARGET_XARCH) GenTree* tmp3; - GenTree* tmp4; if (compOpportunisticallyDependsOn(InstructionSet_AVX512)) { // This is the same in principle to the other comments below, however due to @@ -25323,8 +25128,6 @@ GenTree* Compiler::gtNewSimdNarrowNode( } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); - switch (simdBaseType) { case TYP_BYTE: @@ -25495,67 +25298,29 @@ GenTree* Compiler::gtNewSimdNarrowNode( // op1 = Elements 0, 1, 2, 3; 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U // op2 = Elements 4, 5, 6, 7; 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U // - // ... - if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // ... - // - // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, -- - // tmp3 = Elements 4L, --, 5L, --, 6L, --, 7L, -- - // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L - // - // var vcns = Vector128.Create(0x0000FFFF).AsInt16(); - // var tmp1 = Sse2.And(op1.AsInt16(), vcns); - // var tmp2 = Sse2.And(op2.AsInt16(), vcns); - // return Sse2.PackUnsignedSaturate(tmp1, tmp2).As(); - - GenTreeVecCon* vecCon1 = gtNewVconNode(type); - - for (unsigned i = 0; i < (simdSize / 8); i++) - { - vecCon1->gtSimdVal.u64[i] = 0x0000FFFF0000FFFF; - } - - GenTree* vecCon2 = gtCloneExpr(vecCon1); + // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, -- + // tmp3 = Elements 4L, --, 5L, --, 6L, --, 7L, -- + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L + // + // var vcns = Vector128.Create(0x0000FFFF).AsInt16(); + // var tmp1 = Sse2.And(op1.AsInt16(), vcns); + // var tmp2 = Sse2.And(op2.AsInt16(), vcns); + // return Sse2.PackUnsignedSaturate(tmp1, tmp2).As(); - tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize); - tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize); + GenTreeVecCon* vecCon1 = gtNewVconNode(type); - return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE42_PackUnsignedSaturate, - CORINFO_TYPE_USHORT, simdSize); - } - else + for (unsigned i = 0; i < (simdSize / 8); i++) { - // ... 
- // - // tmp1 = Elements 0L, 4L, 0U, 4U, 1L, 5L, 1U, 5U - // tmp2 = Elements 2L, 6L, 2U, 6U, 3L, 7L, 3U, 7U - // tmp3 = Elements 0L, 2L, 4L, 6L, 0U, 2U, 4U, 6U - // tmp4 = Elements 1L, 3L, 5L, 7L, 1U, 3U, 5U, 7U - // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L - // - // var tmp1 = Sse2.UnpackLow(op1.AsUInt16(), op2.AsUInt16()); - // var tmp2 = Sse2.UnpackHigh(op1.AsUInt16(), op2.AsUInt16()); - // var tmp3 = Sse2.UnpackLow(tmp1, tmp2); - // var tmp4 = Sse2.UnpackHigh(tmp1, tmp2); - // return Sse2.UnpackLow(tmp3, tmp4).As(); - - GenTree* op1Dup = fgMakeMultiUse(&op1); - GenTree* op2Dup = fgMakeMultiUse(&op2); - - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_X86Base_UnpackLow, simdBaseJitType, simdSize); - tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_X86Base_UnpackHigh, simdBaseJitType, - simdSize); + vecCon1->gtSimdVal.u64[i] = 0x0000FFFF0000FFFF; + } - GenTree* tmp1Dup = fgMakeMultiUse(&tmp1); - GenTree* tmp2Dup = fgMakeMultiUse(&tmp2); + GenTree* vecCon2 = gtCloneExpr(vecCon1); - tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_X86Base_UnpackLow, simdBaseJitType, simdSize); - tmp4 = gtNewSimdHWIntrinsicNode(type, tmp1Dup, tmp2Dup, NI_X86Base_UnpackHigh, simdBaseJitType, - simdSize); + tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize); + tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize); - return gtNewSimdHWIntrinsicNode(type, tmp3, tmp4, NI_X86Base_UnpackLow, simdBaseJitType, simdSize); - } + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_X86Base_PackUnsignedSaturate, CORINFO_TYPE_USHORT, + simdSize); } case TYP_INT: @@ -25689,19 +25454,16 @@ GenTree* Compiler::gtNewSimdRoundNode(var_types type, GenTree* op1, CorInfoType #if defined(TARGET_XARCH) if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_RoundToNearestInteger; } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); GenTree* op2 = gtNewIconNode(static_cast(FloatRoundingMode::ToNearestInteger)); return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512_RoundScale, simdBaseJitType, simdSize); } else { - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - intrinsic = NI_SSE42_RoundToNearestInteger; + intrinsic = NI_X86Base_RoundToNearestInteger; } #elif defined(TARGET_ARM64) if (simdBaseType == TYP_DOUBLE) @@ -25792,16 +25554,12 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( } else if (elementSize == 2) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); - // swap the operands to match the encoding requirements retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512_PermuteVar32x16, simdBaseJitType, simdSize); retNode->SetReverseOp(); } else if (elementSize == 4) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); - // swap the operands to match the encoding requirements retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512_PermuteVar16x32, simdBaseJitType, simdSize); retNode->SetReverseOp(); @@ -25809,7 +25567,6 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( else { assert(elementSize == 8); - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); // swap the operands to match the encoding requirements retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512_PermuteVar8x64, simdBaseJitType, simdSize); @@ -25818,9 +25575,7 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( } else if ((elementSize == 1) && (simdSize == 16)) { - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - - retNode = 
gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE42_Shuffle, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_X86Base_Shuffle, simdBaseJitType, simdSize); // high bit on index gives 0 already canUseSignedComparisonHint = true; @@ -25835,7 +25590,6 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( } else if ((elementSize == 2) && compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - assert((simdSize == 16) || (simdSize == 32)); NamedIntrinsic intrinsic = (simdSize == 16) ? NI_AVX512_PermuteVar8x16 : NI_AVX512_PermuteVar16x16; // swap the operands to match the encoding requirements @@ -26113,7 +25867,6 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( } else { - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); assert(simdSize == 16); assert(elementSize > 1); @@ -26156,7 +25909,7 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( cnsNode = gtNewVconNode(type); cnsNode->AsVecCon()->gtSimdVal = shufCns; - op2 = gtNewSimdHWIntrinsicNode(type, op2, cnsNode, NI_SSE42_Shuffle, simdBaseJitType, simdSize); + op2 = gtNewSimdHWIntrinsicNode(type, op2, cnsNode, NI_X86Base_Shuffle, simdBaseJitType, simdSize); // or the relevant bits @@ -26173,7 +25926,7 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( // apply normal byte shuffle now that we've converted it - retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE42_Shuffle, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_X86Base_Shuffle, simdBaseJitType, simdSize); } } #elif defined(TARGET_ARM64) @@ -26280,11 +26033,8 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( bool subComparandNode = false; #if defined(TARGET_XARCH) - // check if we have hardware accelerated unsigned comparison - bool hardwareAcceleratedUnsignedComparison = compOpportunisticallyDependsOn(InstructionSet_AVX512); - // if the hardware doesn't support direct unsigned comparison, we attempt to use signed comparison - if (!hardwareAcceleratedUnsignedComparison) + if (!compOpportunisticallyDependsOn(InstructionSet_AVX512)) { corType = CORINFO_TYPE_BYTE; if (elementSize == 2) @@ -26719,8 +26469,6 @@ GenTree* Compiler::gtNewSimdShuffleNode( } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); - if (!crossLane) { // if element size is 64-bit, try to use vshufpd instead of vpshufb. @@ -26830,18 +26578,17 @@ GenTree* Compiler::gtNewSimdShuffleNode( return retNode; } - else + else if (needsZero) { - if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; - - op2 = gtNewVconNode(type); - op2->AsVecCon()->gtSimd16Val = vecCns.v128[0]; + simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; - return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE42_Shuffle, simdBaseJitType, simdSize); - } + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimd16Val = vecCns.v128[0]; + return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_X86Base_Shuffle, simdBaseJitType, simdSize); + } + else + { if (varTypeIsLong(simdBaseType)) { // TYP_LONG and TYP_ULONG don't have their own shuffle/permute instructions and so we'll @@ -26876,7 +26623,7 @@ GenTree* Compiler::gtNewSimdShuffleNode( if (needsZero) { - assert((simdSize == 32) || (!compIsaSupportedDebugOnly(InstructionSet_SSE42))); + assert(simdSize == 32); op2 = gtNewVconNode(type); op2->AsVecCon()->gtSimdVal = mskCns; @@ -26954,12 +26701,10 @@ GenTree* Compiler::gtNewSimdSqrtNode(var_types type, GenTree* op1, CorInfoType s #if defined(TARGET_XARCH) if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_Sqrt; } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_AVX512_Sqrt; } else @@ -27037,12 +26782,10 @@ GenTree* Compiler::gtNewSimdStoreAlignedNode(GenTree* op1, GenTree* op2, CorInfo if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_StoreAligned; } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_AVX512_StoreAligned; } else @@ -27094,12 +26837,10 @@ GenTree* Compiler::gtNewSimdStoreNonTemporalNode(GenTree* op1, if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_AVX512_StoreAlignedNonTemporal; } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_StoreAlignedNonTemporal; } else @@ -27138,7 +26879,6 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); GenTree* op1Dup = fgMakeMultiUse(&op1); op1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseJitType, simdSize); @@ -27162,7 +26902,6 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); GenTree* op1Dup = fgMakeMultiUse(&op1); op1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize); @@ -27194,7 +26933,6 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si if (compOpportunisticallyDependsOn(InstructionSet_AVX)) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); // The permute below gives us [0, 1, 2, 3] -> [1, 0, 3, 2] op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode((int)0b10110001, TYP_INT), NI_AVX_Permute, simdBaseJitType, simdSize); @@ -27230,7 +26968,6 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si if (compOpportunisticallyDependsOn(InstructionSet_AVX)) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); // The permute below gives us [0, 1] -> [1, 0] op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode((int)0b0001, TYP_INT), NI_AVX_Permute, simdBaseJitType, simdSize); @@ -27416,12 +27153,10 @@ GenTree* Compiler::gtNewSimdToScalarNode(var_types type, GenTree* op1, CorInfoTy #ifdef TARGET_XARCH if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); intrinsic = NI_Vector512_ToScalar; } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_Vector256_ToScalar; } else 
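The gtNewSimdSumNode hunks above keep the comment that the permute with immediate 0b10110001 maps lanes [0, 1, 2, 3] to [1, 0, 3, 2]. For readers who want to see the reduction outside the JIT, below is a minimal host-side sketch of that shuffle/add pattern for a 4 x float vector; it is illustrative only (the function name is made up, and shufps stands in for the vpermilps that the AVX-accelerated path emits), not the JIT's own node sequence.

// Standalone illustration of the shuffle/add horizontal sum described in the
// gtNewSimdSumNode comments above. Hypothetical helper, not JIT code.
#include <immintrin.h>

float HorizontalSum(__m128 v)
{
    // imm8 0b10110001 selects elements 1, 0, 3, 2 ("[0, 1, 2, 3] -> [1, 0, 3, 2]")
    __m128 swapped = _mm_shuffle_ps(v, v, 0b10110001);
    __m128 partial = _mm_add_ps(v, swapped);    // {0+1, 1+0, 2+3, 3+2}

    // imm8 0b01001110 selects elements 2, 3, 0, 1
    swapped = _mm_shuffle_ps(partial, partial, 0b01001110);
    partial = _mm_add_ps(partial, swapped);     // every lane now holds the total

    return _mm_cvtss_f32(partial);              // extract lane 0
}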
@@ -27473,19 +27208,16 @@ GenTree* Compiler::gtNewSimdTruncNode(var_types type, GenTree* op1, CorInfoType #if defined(TARGET_XARCH) if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_RoundToZero; } else if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); GenTree* op2 = gtNewIconNode(static_cast(FloatRoundingMode::ToZero)); return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512_RoundScale, simdBaseJitType, simdSize); } else { - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - intrinsic = NI_SSE42_RoundToZero; + intrinsic = NI_X86Base_RoundToZero; } #elif defined(TARGET_ARM64) if (simdBaseType == TYP_DOUBLE) @@ -27618,8 +27350,6 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, GenTree* op1, CorInfo #if defined(TARGET_XARCH) if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); - tmp1 = gtNewSimdGetLowerNode(TYP_SIMD32, op1, simdBaseJitType, simdSize); switch (simdBaseType) @@ -27677,7 +27407,6 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, GenTree* op1, CorInfo } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); tmp1 = gtNewSimdGetLowerNode(TYP_SIMD16, op1, simdBaseJitType, simdSize); @@ -27720,28 +27449,28 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, GenTree* op1, CorInfo assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize); } - else if ((simdBaseType == TYP_FLOAT) || compOpportunisticallyDependsOn(InstructionSet_SSE42)) + else { switch (simdBaseType) { case TYP_BYTE: case TYP_UBYTE: { - intrinsic = NI_SSE42_ConvertToVector128Int16; + intrinsic = NI_X86Base_ConvertToVector128Int16; break; } case TYP_SHORT: case TYP_USHORT: { - intrinsic = NI_SSE42_ConvertToVector128Int32; + intrinsic = NI_X86Base_ConvertToVector128Int32; break; } case TYP_INT: case TYP_UINT: { - intrinsic = NI_SSE42_ConvertToVector128Int64; + intrinsic = NI_X86Base_ConvertToVector128Int64; break; } @@ -27760,19 +27489,6 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, GenTree* op1, CorInfo assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } - else - { - tmp1 = gtNewZeroConNode(type); - - if (varTypeIsSigned(simdBaseType)) - { - GenTree* op1Dup = fgMakeMultiUse(&op1); - - tmp1 = gtNewSimdHWIntrinsicNode(type, op1Dup, tmp1, NI_X86Base_CompareLessThan, simdBaseJitType, simdSize); - } - - return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_X86Base_UnpackLow, simdBaseJitType, simdSize); - } #elif defined(TARGET_ARM64) if (simdSize == 16) { @@ -27830,8 +27546,6 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo #if defined(TARGET_XARCH) if (simdSize == 64) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512)); - tmp1 = gtNewSimdGetUpperNode(TYP_SIMD32, op1, simdBaseJitType, simdSize); switch (simdBaseType) @@ -27889,7 +27603,6 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo } else if (simdSize == 32) { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); tmp1 = gtNewSimdGetUpperNode(TYP_SIMD16, op1, simdBaseJitType, simdSize); @@ -27941,7 +27654,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo tmp1 = 
gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_X86Base_MoveHighToLow, simdBaseJitType, simdSize); return gtNewSimdHWIntrinsicNode(type, tmp1, NI_X86Base_ConvertToVector128Double, simdBaseJitType, simdSize); } - else if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) + else { tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(8), NI_X86Base_ShiftRightLogical128BitLane, simdBaseJitType, simdSize); @@ -27951,21 +27664,21 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo case TYP_BYTE: case TYP_UBYTE: { - intrinsic = NI_SSE42_ConvertToVector128Int16; + intrinsic = NI_X86Base_ConvertToVector128Int16; break; } case TYP_SHORT: case TYP_USHORT: { - intrinsic = NI_SSE42_ConvertToVector128Int32; + intrinsic = NI_X86Base_ConvertToVector128Int32; break; } case TYP_INT: case TYP_UINT: { - intrinsic = NI_SSE42_ConvertToVector128Int64; + intrinsic = NI_X86Base_ConvertToVector128Int64; break; } @@ -27978,19 +27691,6 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize); } - else - { - tmp1 = gtNewZeroConNode(type); - - if (varTypeIsSigned(simdBaseType)) - { - GenTree* op1Dup = fgMakeMultiUse(&op1); - - tmp1 = gtNewSimdHWIntrinsicNode(type, op1Dup, tmp1, NI_X86Base_CompareLessThan, simdBaseJitType, simdSize); - } - - return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_X86Base_UnpackHigh, simdBaseJitType, simdSize); - } #elif defined(TARGET_ARM64) if (simdSize == 16) { @@ -28050,31 +27750,7 @@ GenTree* Compiler::gtNewSimdWithElementNode( assert(varTypeIsArithmetic(op3)); #if defined(TARGET_XARCH) - switch (simdBaseType) - { - // Using software fallback if simdBaseType is not supported by hardware - case TYP_BYTE: - case TYP_UBYTE: - case TYP_INT: - case TYP_UINT: - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42)); - break; - - case TYP_LONG: - case TYP_ULONG: - assert(compIsaSupportedDebugOnly(InstructionSet_SSE42_X64)); - break; - - case TYP_DOUBLE: - case TYP_FLOAT: - case TYP_SHORT: - case TYP_USHORT: - // Supported by baseline ISA requirement - break; - - default: - unreached(); - } + assert(!varTypeIsLong(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_X86Base_X64)); if (simdSize == 64) { @@ -28512,9 +28188,9 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const switch (intrinsicId) { - case NI_SSE42_ConvertToVector128Int16: - case NI_SSE42_ConvertToVector128Int32: - case NI_SSE42_ConvertToVector128Int64: + case NI_X86Base_ConvertToVector128Int16: + case NI_X86Base_ConvertToVector128Int32: + case NI_X86Base_ConvertToVector128Int64: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_ConvertToVector256Int16: @@ -28740,12 +28416,12 @@ bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const NamedIntrinsic intrinsicId = GetHWIntrinsicId(); switch (intrinsicId) { - case NI_AVX2_BroadcastScalarToVector128: - case NI_AVX2_BroadcastScalarToVector256: + case NI_X86Base_LoadAndDuplicateToVector128: + case NI_X86Base_MoveAndDuplicate: case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: - case NI_SSE42_LoadAndDuplicateToVector128: - case NI_SSE42_MoveAndDuplicate: + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512_BroadcastScalarToVector512: return true; default: @@ -29352,7 +29028,6 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty #if 
defined(TARGET_XARCH) case NI_X86Base_MultiplyLow: - case NI_SSE42_MultiplyLow: case NI_AVX_Multiply: case NI_AVX2_MultiplyLow: case NI_AVX512_MultiplyLow: @@ -29535,7 +29210,6 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty #if defined(TARGET_XARCH) case NI_X86Base_CompareEqual: - case NI_SSE42_CompareEqual: case NI_AVX_CompareEqual: case NI_AVX2_CompareEqual: case NI_AVX512_CompareEqualMask: @@ -29568,7 +29242,6 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty #if defined(TARGET_XARCH) case NI_X86Base_CompareGreaterThan: - case NI_SSE42_CompareGreaterThan: case NI_AVX_CompareGreaterThan: case NI_AVX2_CompareGreaterThan: case NI_AVX512_CompareGreaterThanMask: @@ -29632,7 +29305,6 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty #if defined(TARGET_XARCH) case NI_X86Base_CompareLessThan: - case NI_SSE42_CompareLessThan: case NI_AVX_CompareLessThan: case NI_AVX2_CompareLessThan: case NI_AVX512_CompareLessThanMask: @@ -29739,15 +29411,9 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp( assert(varTypeIsSIMD(simdType)); #if defined(TARGET_XARCH) - if (simdSize == 64) + if ((simdSize == 64) || (simdSize == 32)) { assert(!isScalar); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512)); - } - else if (simdSize == 32) - { - assert(!isScalar); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); } else #endif // TARGET_XARCH @@ -29843,15 +29509,9 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(op2 != nullptr); #if defined(TARGET_XARCH) - if (simdSize == 64) - { - assert(!isScalar); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512)); - } - else if (simdSize == 32) + if ((simdSize == 64) || (simdSize == 32)) { assert(!isScalar); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); } else #endif // TARGET_XARCH @@ -29874,14 +29534,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, #if defined(TARGET_XARCH) if (simdSize == 64) { - if (varTypeIsSmall(simdBaseType)) - { - id = NI_AVX512_Add; - } - else - { - id = NI_AVX512_Add; - } + id = NI_AVX512_Add; } else if (simdSize == 32) { @@ -29924,14 +29577,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, #if defined(TARGET_XARCH) if (simdSize == 64) { - if (varTypeIsFloating(simdBaseType)) - { - id = NI_AVX512_And; - } - else - { - id = NI_AVX512_And; - } + id = NI_AVX512_And; } else if (simdSize == 32) { @@ -29971,14 +29617,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, #if defined(TARGET_XARCH) if (simdSize == 64) { - if (varTypeIsFloating(simdBaseType)) - { - id = NI_AVX512_AndNot; - } - else - { - id = NI_AVX512_AndNot; - } + id = NI_AVX512_AndNot; } else if (simdSize == 32) { @@ -30048,52 +29687,38 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(varTypeIsIntegral(simdBaseType)); #if defined(TARGET_XARCH) - if (simdSize == 64) + if (varTypeIsByte(simdBaseType)) { - if (varTypeIsShort(simdBaseType)) - { - id = varTypeIsInt(op2) ? NI_AVX512_ShiftLeftLogical : NI_AVX512_ShiftLeftLogicalVariable; - } - else if (!varTypeIsByte(simdBaseType)) - { - id = varTypeIsInt(op2) ? 
NI_AVX512_ShiftLeftLogical : NI_AVX512_ShiftLeftLogicalVariable; - } + break; } - else if (varTypeIsShort(simdBaseType)) + + if (varTypeIsInt(op2)) { - if (varTypeIsInt(op2)) - { - if (simdSize == 32) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - id = NI_AVX2_ShiftLeftLogical; - } - else - { - id = NI_X86Base_ShiftLeftLogical; - } - } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + if (simdSize == 64) { - id = NI_AVX512_ShiftLeftLogicalVariable; + id = NI_AVX512_ShiftLeftLogical; } - } - else if (!varTypeIsByte(simdBaseType)) - { - if (simdSize == 32) + else if (simdSize == 32) { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - id = varTypeIsInt(op2) ? NI_AVX2_ShiftLeftLogical : NI_AVX2_ShiftLeftLogicalVariable; + id = NI_AVX2_ShiftLeftLogical; } - else if (varTypeIsInt(op2)) + else { id = NI_X86Base_ShiftLeftLogical; } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + } + else if ((simdSize == 64) || varTypeIsShort(simdBaseType)) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - id = NI_AVX2_ShiftLeftLogicalVariable; + id = NI_AVX512_ShiftLeftLogicalVariable; } } + else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + id = NI_AVX2_ShiftLeftLogicalVariable; + } #elif defined(TARGET_ARM64) if ((simdSize == 8) && (genTypeSize(simdBaseType) == 8)) { @@ -30118,15 +29743,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, { id = NI_AVX512_Multiply; } - else if (varTypeIsLong(simdBaseType)) - { - id = NI_AVX512_MultiplyLow; - } - else if (varTypeIsInt(simdBaseType)) - { - id = NI_AVX512_MultiplyLow; - } - else if (varTypeIsShort(simdBaseType)) + else if (!varTypeIsByte(simdBaseType)) { id = NI_AVX512_MultiplyLow; } @@ -30154,14 +29771,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, { id = isScalar ? NI_X86Base_MultiplyScalar : NI_X86Base_Multiply; } - else if (varTypeIsInt(simdBaseType)) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - id = NI_SSE42_MultiplyLow; - } - } - else if (varTypeIsShort(simdBaseType)) + else if (!varTypeIsByte(simdBaseType)) { id = NI_X86Base_MultiplyLow; } @@ -30190,14 +29800,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, #if defined(TARGET_XARCH) if (simdSize == 64) { - if (varTypeIsFloating(simdBaseType)) - { - id = NI_AVX512_Or; - } - else - { - id = NI_AVX512_Or; - } + id = NI_AVX512_Or; } else if (simdSize == 32) { @@ -30229,19 +29832,9 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(varTypeIsIntegral(simdBaseType)); #if defined(TARGET_XARCH) - if (simdSize == 64) + if (!varTypeIsSmall(simdBaseType) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - if (!varTypeIsSmall(simdBaseType)) - { - id = varTypeIsInt(op2) ? NI_AVX512_RotateLeft : NI_AVX512_RotateLeftVariable; - } - } - else if (!varTypeIsSmall(simdBaseType)) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) - { - id = varTypeIsInt(op2) ? NI_AVX512_RotateLeft : NI_AVX512_RotateLeftVariable; - } + id = varTypeIsInt(op2) ? 
NI_AVX512_RotateLeft : NI_AVX512_RotateLeftVariable; } #endif // TARGET_XARCH break; @@ -30254,19 +29847,9 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(varTypeIsIntegral(simdBaseType)); #if defined(TARGET_XARCH) - if (simdSize == 64) + if (!varTypeIsSmall(simdBaseType) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - if (!varTypeIsSmall(simdBaseType)) - { - id = varTypeIsInt(op2) ? NI_AVX512_RotateRight : NI_AVX512_RotateRightVariable; - } - } - else if (!varTypeIsSmall(simdBaseType)) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) - { - id = varTypeIsInt(op2) ? NI_AVX512_RotateRight : NI_AVX512_RotateRightVariable; - } + id = varTypeIsInt(op2) ? NI_AVX512_RotateRight : NI_AVX512_RotateRightVariable; } #endif // TARGET_XARCH break; @@ -30279,59 +29862,41 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(varTypeIsIntegral(simdBaseType)); #if defined(TARGET_XARCH) - if (simdSize == 64) - { - if (varTypeIsShort(simdBaseType)) - { - id = varTypeIsInt(op2) ? NI_AVX512_ShiftRightArithmetic : NI_AVX512_ShiftRightArithmeticVariable; - } - else if (!varTypeIsByte(simdBaseType)) - { - id = varTypeIsInt(op2) ? NI_AVX512_ShiftRightArithmetic : NI_AVX512_ShiftRightArithmeticVariable; - } - } - else if (genTypeSize(simdBaseType) == 8) + if (varTypeIsByte(simdBaseType)) { - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) - { - id = varTypeIsInt(op2) ? NI_AVX512_ShiftRightArithmetic : NI_AVX512_ShiftRightArithmeticVariable; - } + break; } - else if (varTypeIsShort(simdBaseType)) + + if (varTypeIsInt(op2)) { - if (varTypeIsInt(op2)) + if ((simdSize == 64) || (genTypeSize(simdBaseType) == 8)) { - if (simdSize == 32) + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - id = NI_AVX2_ShiftRightArithmetic; + id = NI_AVX512_ShiftRightArithmetic; } - else - { - id = NI_X86Base_ShiftRightArithmetic; - } - } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) - { - id = NI_AVX512_ShiftRightArithmeticVariable; } - } - else if (!varTypeIsByte(simdBaseType)) - { - if (simdSize == 32) + else if (simdSize == 32) { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - id = varTypeIsInt(op2) ? NI_AVX2_ShiftRightArithmetic : NI_AVX2_ShiftRightArithmeticVariable; + id = NI_AVX2_ShiftRightArithmetic; } - else if (varTypeIsInt(op2)) + else { id = NI_X86Base_ShiftRightArithmetic; } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + } + else if ((simdSize == 64) || varTypeIsShort(simdBaseType) || (genTypeSize(simdBaseType) == 8)) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - id = NI_AVX2_ShiftRightArithmeticVariable; + id = NI_AVX512_ShiftRightArithmeticVariable; } } + else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + id = NI_AVX2_ShiftRightArithmeticVariable; + } #elif defined(TARGET_ARM64) if ((simdSize == 8) && (genTypeSize(simdBaseType) == 8)) { @@ -30352,52 +29917,38 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(varTypeIsIntegral(simdBaseType)); #if defined(TARGET_XARCH) - if (simdSize == 64) + if (varTypeIsByte(simdBaseType)) { - if (varTypeIsShort(simdBaseType)) - { - id = varTypeIsInt(op2) ? NI_AVX512_ShiftRightLogical : NI_AVX512_ShiftRightLogicalVariable; - } - else if (!varTypeIsByte(simdBaseType)) - { - id = varTypeIsInt(op2) ? 
NI_AVX512_ShiftRightLogical : NI_AVX512_ShiftRightLogicalVariable; - } + break; } - else if (varTypeIsShort(simdBaseType)) + + if (varTypeIsInt(op2)) { - if (varTypeIsInt(op2)) - { - if (simdSize == 32) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - id = NI_AVX2_ShiftRightLogical; - } - else - { - id = NI_X86Base_ShiftRightLogical; - } - } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) + if (simdSize == 64) { - id = NI_AVX512_ShiftRightLogicalVariable; + id = NI_AVX512_ShiftRightLogical; } - } - else if (!varTypeIsByte(simdBaseType)) - { - if (simdSize == 32) + else if (simdSize == 32) { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - id = varTypeIsInt(op2) ? NI_AVX2_ShiftRightLogical : NI_AVX2_ShiftRightLogicalVariable; + id = NI_AVX2_ShiftRightLogical; } - else if (varTypeIsInt(op2)) + else { id = NI_X86Base_ShiftRightLogical; } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + } + else if ((simdSize == 64) || varTypeIsShort(simdBaseType)) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512)) { - id = NI_AVX2_ShiftRightLogicalVariable; + id = NI_AVX512_ShiftRightLogicalVariable; } } + else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + id = NI_AVX2_ShiftRightLogicalVariable; + } #elif defined(TARGET_ARM64) if ((simdSize == 8) && (genTypeSize(simdBaseType) == 8)) { @@ -30418,14 +29969,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, #if defined(TARGET_XARCH) if (simdSize == 64) { - if (varTypeIsSmall(simdBaseType)) - { - id = NI_AVX512_Subtract; - } - else - { - id = NI_AVX512_Subtract; - } + id = NI_AVX512_Subtract; } else if (simdSize == 32) { @@ -30468,14 +30012,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, #if defined(TARGET_XARCH) if (simdSize == 64) { - if (varTypeIsFloating(simdBaseType)) - { - id = NI_AVX512_Xor; - } - else - { - id = NI_AVX512_Xor; - } + id = NI_AVX512_Xor; } else if (simdSize == 32) { @@ -30554,7 +30091,6 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, else if (simdSize == 32) { assert(!isScalar); - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); } else #endif // TARGET_XARCH @@ -30614,13 +30150,6 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, id = NI_AVX_CompareEqual; } } - else if (varTypeIsLong(simdBaseType)) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - id = NI_SSE42_CompareEqual; - } - } else { id = isScalar ? 
NI_X86Base_CompareScalarEqual : NI_X86Base_CompareEqual; @@ -30696,13 +30225,6 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); id = NI_AVX2_CompareGreaterThan; } - else if (varTypeIsLong(simdBaseType)) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - id = NI_SSE42_CompareGreaterThan; - } - } else { id = NI_X86Base_CompareGreaterThan; @@ -30799,13 +30321,6 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); id = NI_AVX2_CompareLessThan; } - else if (varTypeIsLong(simdBaseType)) - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - id = NI_SSE42_CompareLessThan; - } - } else { id = NI_X86Base_CompareLessThan; @@ -30982,7 +30497,7 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec #endif // TARGET_ARM64 #if defined(TARGET_XARCH) - case NI_SSE42_Insert: + case NI_X86Base_Insert: { // We can optimize for float when the constant is zero // due to a specialized encoding for the instruction @@ -32819,7 +32334,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } - case NI_SSE42_PopCount: + case NI_X86Base_PopCount: { assert(!varTypeIsSmall(retType) && !varTypeIsLong(retType)); @@ -32831,7 +32346,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) break; } - case NI_SSE42_X64_PopCount: + case NI_X86Base_X64_PopCount: { assert(varTypeIsLong(retType)); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 588396c4a7253c..ca815d3c14a8b6 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -942,7 +942,6 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = { // clang-format off #if defined(TARGET_XARCH) { FIRST_NI_X86Base, LAST_NI_X86Base }, // X86Base - { FIRST_NI_SSE42, LAST_NI_SSE42 }, // SSE42 { FIRST_NI_AVX, LAST_NI_AVX }, // AVX { FIRST_NI_AVX2, LAST_NI_AVX2 }, // AVX2 { FIRST_NI_AVX512, LAST_NI_AVX512 }, // AVX512 @@ -973,7 +972,6 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = { { FIRST_NI_AVXVNNIINT_V512, LAST_NI_AVXVNNIINT_V512 }, // AVXVNNIINT_V512 { FIRST_NI_X86Base_X64, LAST_NI_X86Base_X64 }, // X86Base_X64 - { FIRST_NI_SSE42_X64, LAST_NI_SSE42_X64 }, // SSE42_X64 { NI_Illegal, NI_Illegal }, // AVX_X64 { FIRST_NI_AVX2_X64, LAST_NI_AVX2_X64 }, // AVX2_X64 { FIRST_NI_AVX512_X64, LAST_NI_AVX512_X64 }, // AVX512_X64 @@ -2265,9 +2263,9 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, #if defined(TARGET_XARCH) switch (intrinsic) { - case NI_SSE42_ConvertToVector128Int16: - case NI_SSE42_ConvertToVector128Int32: - case NI_SSE42_ConvertToVector128Int64: + case NI_X86Base_ConvertToVector128Int16: + case NI_X86Base_ConvertToVector128Int32: + case NI_X86Base_ConvertToVector128Int64: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_ConvertToVector256Int16: @@ -2323,7 +2321,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, : gtNewSimdHWIntrinsicNode(nodeRetType, op1, op2, intrinsic, simdBaseJitType, simdSize); #ifdef TARGET_XARCH - if ((intrinsic == NI_SSE42_Crc32) || (intrinsic == NI_SSE42_X64_Crc32)) + if ((intrinsic == NI_X86Base_Crc32) || (intrinsic == NI_X86Base_X64_Crc32)) { // TODO-XArch-Cleanup: currently we use the simdBaseJitType to bring the type of the second argument // to the code generator. 
May encode the overload info in other way. diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 9dc8a978174c9d..adf06a7982302b 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -869,7 +869,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { switch (intrinsicId) { - case NI_SSE42_BlendVariable: + case NI_X86Base_BlendVariable: case NI_AVX_BlendVariable: case NI_AVX2_BlendVariable: case NI_AVX512_BlendVariableMask: @@ -1005,13 +1005,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case InstructionSet_SSE42: - case InstructionSet_SSE42_X64: - { - genSse42Intrinsic(node, instOptions); - break; - } - case InstructionSet_AVX: case InstructionSet_AVX2: case InstructionSet_AVX2_X64: @@ -1908,19 +1901,9 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) if (!canCombineLoad) { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); - inst_RV_RV_TT_IV(INS_pinsrd, EA_16BYTE, targetReg, targetReg, hiPart, 0x01, - !compiler->canUseVexEncoding(), instOptions); - } - else - { - regNumber tmpReg = internalRegisters.GetSingle(node); - genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); - genHWIntrinsic_R_RM(node, ins, baseAttr, tmpReg, hiPart, instOptions); - emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, targetReg, tmpReg, instOptions); - } + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); + inst_RV_RV_TT_IV(INS_pinsrd, EA_16BYTE, targetReg, targetReg, hiPart, 0x01, + !compiler->canUseVexEncoding(), instOptions); break; } @@ -1961,26 +1944,17 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) if (baseType == TYP_FLOAT) { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // insertps imm8 is: - // * Bits 0-3: zmask - // * Bits 4-5: count_d - // * Bits 6-7: count_s (register form only) - // - // We want zmask 0b1110 (0xE) to zero elements 1/2/3 - // We want count_d 0b00 (0x0) to insert the value to element 0 - // We want count_s 0b00 (0x0) as we're just taking element 0 of the source - - emit->emitIns_SIMD_R_R_R_I(INS_insertps, attr, targetReg, targetReg, op1Reg, 0x0E, - instOptions); - } - else - { - assert(targetReg != op1Reg); - emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg, instOptions); - emit->emitIns_Mov(INS_movss, attr, targetReg, op1Reg, /* canSkip */ false); - } + // insertps imm8 is: + // * Bits 0-3: zmask + // * Bits 4-5: count_d + // * Bits 6-7: count_s (register form only) + // + // We want zmask 0b1110 (0xE) to zero elements 1/2/3 + // We want count_d 0b00 (0x0) to insert the value to element 0 + // We want count_s 0b00 (0x0) as we're just taking element 0 of the source + + emit->emitIns_SIMD_R_R_R_I(INS_insertps, attr, targetReg, targetReg, op1Reg, 0x0E, + instOptions); } else { @@ -2145,15 +2119,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) { if (ival == 1) { - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - emit->emitIns_R_R(INS_movshdup, attr, targetReg, op1Reg); - } - else - { - emit->emitIns_SIMD_R_R_R_I(INS_shufps, attr, targetReg, op1Reg, op1Reg, - static_cast(0x55), instOptions); - } + emit->emitIns_R_R(INS_movshdup, attr, targetReg, op1Reg); } else if (ival == 2) { @@ -2564,40 +2530,11 @@ void 
CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) break; } - default: - unreached(); - break; - } - - genProduceReg(node); -} - -//------------------------------------------------------------------------ -// genSse42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node -// -// Arguments: -// node - The hardware intrinsic node -// -void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) -{ - NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); - regNumber targetReg = node->GetRegNum(); - GenTree* op1 = node->Op(1); - var_types baseType = node->GetSimdBaseType(); - var_types targetType = node->TypeGet(); - emitter* emit = GetEmitter(); - - assert(targetReg != REG_NA); - assert(!node->OperIsCommutative()); - - genConsumeMultiOpOperands(node); - - switch (intrinsicId) - { - case NI_SSE42_ConvertToVector128Int16: - case NI_SSE42_ConvertToVector128Int32: - case NI_SSE42_ConvertToVector128Int64: + case NI_X86Base_ConvertToVector128Int16: + case NI_X86Base_ConvertToVector128Int32: + case NI_X86Base_ConvertToVector128Int64: { + GenTree* op1 = node->Op(1); instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType, compiler); if (!varTypeIsSIMD(op1->TypeGet())) @@ -2614,12 +2551,13 @@ void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) break; } - case NI_SSE42_Crc32: - case NI_SSE42_X64_Crc32: + case NI_X86Base_Crc32: + case NI_X86Base_X64_Crc32: { assert(instOptions == INS_OPTS_NONE); instruction ins = INS_crc32; + GenTree* op1 = node->Op(1); regNumber op1Reg = op1->GetRegNum(); GenTree* op2 = node->Op(2); @@ -2672,12 +2610,11 @@ void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) break; } - case NI_SSE42_Extract: - case NI_SSE42_X64_Extract: + case NI_X86Base_Extract: + case NI_X86Base_X64_Extract: { - assert(!varTypeIsFloating(baseType)); - instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType, compiler); + GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); emitAttr attr = emitActualTypeSize(targetType); @@ -2704,18 +2641,16 @@ void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) break; } - case NI_SSE42_PopCount: - case NI_SSE42_X64_PopCount: + case NI_X86Base_PopCount: + case NI_X86Base_X64_PopCount: { genXCNTIntrinsic(node, INS_popcnt); break; } default: - { unreached(); break; - } } genProduceReg(node); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index be680becff0b52..265f45c9121c8e 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -419,20 +419,27 @@ HARDWARE_INTRINSIC(Vector512, op_UnsignedRightShift, // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// Intrinsics for X86Base, SSE, SSE2 -#define FIRST_NI_X86Base NI_X86Base_Add +// Intrinsics for X86Base, SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT +#define FIRST_NI_X86Base NI_X86Base_Abs +HARDWARE_INTRINSIC(X86Base, Abs, 16, 1, 
{INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, Add, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(X86Base, AddSaturate, 16, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(X86Base, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, AlignRight, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandd, INS_pandd, INS_pandd, INS_pandd, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(X86Base, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, BlendVariable, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, 
HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(X86Base, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(X86Base, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(X86Base, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(X86Base, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(X86Base, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) @@ -473,39 +480,59 @@ HARDWARE_INTRINSIC(X86Base, ConvertToInt32, HARDWARE_INTRINSIC(X86Base, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si32, INS_cvttsd2si32}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) 
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Double, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_cvttpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(X86Base, ConvertToVector128Single, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, Crc32, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic) HARDWARE_INTRINSIC(X86Base, DivRem, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_idiv, INS_div, INS_idiv, INS_div, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic) HARDWARE_INTRINSIC(X86Base, Divide, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(X86Base, DivideScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_divsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(X86Base, Extract, 16, 2, {INS_invalid, INS_invalid, INS_pextrw, INS_pextrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(X86Base, Insert, 16, 3, {INS_invalid, INS_invalid, INS_pinsrw, INS_pinsrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, 
HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp) +HARDWARE_INTRINSIC(X86Base, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_pextrw, INS_pextrw, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_pinsrw, INS_pinsrw, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp) HARDWARE_INTRINSIC(X86Base, LoadAlignedVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(X86Base, LoadAlignedVector128NonTemporal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(X86Base, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, LoadDquVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, LoadFence, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, 
HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Barrier) HARDWARE_INTRINSIC(X86Base, LoadHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, LoadLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, LoadScalarVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_movd32, INS_movq, INS_movq, INS_movss, INS_movsd_simd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, LoadVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(X86Base, MaskMove, 16, 3, {INS_maskmovdqu, INS_maskmovdqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) -HARDWARE_INTRINSIC(X86Base, Max, 16, 2, {INS_invalid, INS_pmaxub, INS_pmaxsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) +HARDWARE_INTRINSIC(X86Base, Max, 16, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) HARDWARE_INTRINSIC(X86Base, MaxScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_maxsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(X86Base, MemoryFence, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Barrier) -HARDWARE_INTRINSIC(X86Base, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) +HARDWARE_INTRINSIC(X86Base, Min, 16, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) +HARDWARE_INTRINSIC(X86Base, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, MoveHighToLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(X86Base, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, MoveLowToHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) HARDWARE_INTRINSIC(X86Base, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(X86Base, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(X86Base, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(X86Base, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(X86Base, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative) HARDWARE_INTRINSIC(X86Base, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(X86Base, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(X86Base, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(X86Base, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(X86Base, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_mulsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) 
HARDWARE_INTRINSIC(X86Base, Or, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pord, INS_pord, INS_pord, INS_pord, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, PackSignedSaturate, 16, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(X86Base, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(X86Base, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(X86Base, Pause, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other) +HARDWARE_INTRINSIC(X86Base, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(X86Base, Prefetch0, 0, 1, {INS_invalid, INS_prefetcht0, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other) HARDWARE_INTRINSIC(X86Base, Prefetch1, 0, 1, {INS_invalid, INS_prefetcht1, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other) HARDWARE_INTRINSIC(X86Base, Prefetch2, 0, 1, {INS_invalid, INS_prefetcht2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other) @@ -514,14 +541,25 @@ HARDWARE_INTRINSIC(X86Base, Reciprocal, HARDWARE_INTRINSIC(X86Base, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(X86Base, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(X86Base, ShiftLeftLogical, 16, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(X86Base, ShiftLeftLogical128BitLane, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, ShiftRightArithmetic, 16, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_psrad, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(X86Base, ShiftRightLogical, 16, 2, {INS_invalid, INS_invalid, INS_psrlw, INS_psrlw, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(X86Base, ShiftRightLogical128BitLane, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(X86Base, Shuffle, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(X86Base, Shuffle, 16, -1, 
{INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(X86Base, ShuffleHigh, 16, 2, {INS_invalid, INS_invalid, INS_pshufhw, INS_pshufhw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(X86Base, ShuffleLow, 16, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(X86Base, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(X86Base, Store, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromSecondArg) @@ -536,6 +574,9 @@ HARDWARE_INTRINSIC(X86Base, Subtract, HARDWARE_INTRINSIC(X86Base, SubtractSaturate, 16, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(X86Base, SubtractScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(X86Base, SumAbsoluteDifferences, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(X86Base, TestC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(X86Base, TestNotZAndNotC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(X86Base, TestZ, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(X86Base, UnpackHigh, 16, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(X86Base, UnpackLow, 16, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, 
HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(X86Base, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pxord, INS_pxord, INS_pxord, INS_pxord, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp|HW_Flag_NormalizeSmallTypeToInt) @@ -545,7 +586,7 @@ HARDWARE_INTRINSIC(X86Base, Xor, // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// 64-bit only Intrinsics for X86Base, SSE, SSE2 +// 64-bit only Intrinsics for X86Base, SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT #define FIRST_NI_X86Base_X64 NI_X86Base_X64_BitScanForward HARDWARE_INTRINSIC(X86Base_X64, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base_X64, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) @@ -556,88 +597,14 @@ HARDWARE_INTRINSIC(X86Base_X64, ConvertScalarToVector128UInt64, HARDWARE_INTRINSIC(X86Base_X64, ConvertToInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd64, INS_invalid, INS_cvtss2si64, INS_cvtsd2si64}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(X86Base_X64, ConvertToInt64WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si64, INS_cvttsd2si64}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(X86Base_X64, ConvertToUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(X86Base_X64, Crc32, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_crc32, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic) HARDWARE_INTRINSIC(X86Base_X64, DivRem, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_idiv, INS_div, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic) +HARDWARE_INTRINSIC(X86Base_X64, Extract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pextrq, INS_pextrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base_X64, Insert, 16, 
3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pinsrq, INS_pinsrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp) +HARDWARE_INTRINSIC(X86Base_X64, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(X86Base_X64, StoreNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti64, INS_movnti64, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg) #define LAST_NI_X86Base_X64 NI_X86Base_X64_StoreNonTemporal -// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// ISA Function name SIMD size NumArg Instructions Category Flags -// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} -// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// Intrinsics for SSE3, SSSE3, SSE41, SSE42, POPCNT -#define FIRST_NI_SSE42 NI_SSE42_Abs -HARDWARE_INTRINSIC(SSE42, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, AlignRight, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(SSE42, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, BlendVariable, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(SSE42, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(SSE42, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(SSE42, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(SSE42, Crc32, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(SSE42, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp) -HARDWARE_INTRINSIC(SSE42, LoadAlignedVector128NonTemporal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(SSE42, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, LoadDquVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(SSE42, Max, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE42, Min, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE42, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE42, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE42, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE42, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE42, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE42, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(SSE42, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, Shuffle, 16, 2, {INS_pshufb, INS_pshufb, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE42, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, TestC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(SSE42, TestNotZAndNotC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(SSE42, TestZ, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) -#define LAST_NI_SSE42 NI_SSE42_TestZ - -// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// ISA Function name SIMD size NumArg Instructions Category Flags -// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} -// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// 64-bit only Intrinsics for SSE3, SSSE3, SSE41, SSE42, POPCNT -#define FIRST_NI_SSE42_X64 NI_SSE42_X64_Crc32 -HARDWARE_INTRINSIC(SSE42_X64, Crc32, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_crc32, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(SSE42_X64, Extract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pextrq, INS_pextrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42_X64, Insert, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pinsrq, INS_pinsrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp) -HARDWARE_INTRINSIC(SSE42_X64, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen) -#define LAST_NI_SSE42_X64 NI_SSE42_X64_PopCount - -// 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// ISA Function name SIMD size NumArg Instructions Category Flags -// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} -// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // Intrinsics for AVX #define FIRST_NI_AVX NI_AVX_Add HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) @@ -1208,8 +1175,8 @@ HARDWARE_INTRINSIC(GFNI_V512, GaloisFieldMultiply, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // Special intrinsics that are generated during lowering HARDWARE_INTRINSIC(X86Base, COMIS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(X86Base, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, UCOMIS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE42, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, AndNotVector, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX2, AndNotScalar, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 
c614918b2074d5..0a18981bd2d476 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -20,8 +20,6 @@ static CORINFO_InstructionSet X64VersionOfIsa(CORINFO_InstructionSet isa) { case InstructionSet_X86Base: return InstructionSet_X86Base_X64; - case InstructionSet_SSE42: - return InstructionSet_SSE42_X64; case InstructionSet_AVX: return InstructionSet_AVX_X64; case InstructionSet_AVX2: @@ -333,7 +331,7 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className) } else if (strcmp(className + 1, "opcnt") == 0) { - return InstructionSet_SSE42; + return InstructionSet_X86Base; } } else if (className[0] == 'S') @@ -350,20 +348,20 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className) } else if (strcmp(className + 3, "3") == 0) { - return InstructionSet_SSE42; + return InstructionSet_X86Base; } else if (strcmp(className + 3, "41") == 0) { - return InstructionSet_SSE42; + return InstructionSet_X86Base; } else if (strcmp(className + 3, "42") == 0) { - return InstructionSet_SSE42; + return InstructionSet_X86Base; } } else if (strcmp(className + 1, "sse3") == 0) { - return InstructionSet_SSE42; + return InstructionSet_X86Base; } } else if (className[0] == 'V') @@ -1054,54 +1052,54 @@ int HWIntrinsicInfo::lookupIval(Compiler* comp, NamedIntrinsic id, var_types sim return static_cast(FloatComparisonMode::UnorderedNonSignaling); } - case NI_SSE42_Ceiling: - case NI_SSE42_CeilingScalar: + case NI_X86Base_Ceiling: + case NI_X86Base_CeilingScalar: case NI_AVX_Ceiling: { FALLTHROUGH; } - case NI_SSE42_RoundToPositiveInfinity: - case NI_SSE42_RoundToPositiveInfinityScalar: + case NI_X86Base_RoundToPositiveInfinity: + case NI_X86Base_RoundToPositiveInfinityScalar: case NI_AVX_RoundToPositiveInfinity: { assert(varTypeIsFloating(simdBaseType)); return static_cast(FloatRoundingMode::ToPositiveInfinity); } - case NI_SSE42_Floor: - case NI_SSE42_FloorScalar: + case NI_X86Base_Floor: + case NI_X86Base_FloorScalar: case NI_AVX_Floor: { FALLTHROUGH; } - case NI_SSE42_RoundToNegativeInfinity: - case NI_SSE42_RoundToNegativeInfinityScalar: + case NI_X86Base_RoundToNegativeInfinity: + case NI_X86Base_RoundToNegativeInfinityScalar: case NI_AVX_RoundToNegativeInfinity: { assert(varTypeIsFloating(simdBaseType)); return static_cast(FloatRoundingMode::ToNegativeInfinity); } - case NI_SSE42_RoundCurrentDirection: - case NI_SSE42_RoundCurrentDirectionScalar: + case NI_X86Base_RoundCurrentDirection: + case NI_X86Base_RoundCurrentDirectionScalar: case NI_AVX_RoundCurrentDirection: { assert(varTypeIsFloating(simdBaseType)); return static_cast(FloatRoundingMode::CurrentDirection); } - case NI_SSE42_RoundToNearestInteger: - case NI_SSE42_RoundToNearestIntegerScalar: + case NI_X86Base_RoundToNearestInteger: + case NI_X86Base_RoundToNearestIntegerScalar: case NI_AVX_RoundToNearestInteger: { assert(varTypeIsFloating(simdBaseType)); return static_cast(FloatRoundingMode::ToNearestInteger); } - case NI_SSE42_RoundToZero: - case NI_SSE42_RoundToZeroScalar: + case NI_X86Base_RoundToZero: + case NI_X86Base_RoundToZeroScalar: case NI_AVX_RoundToZero: { assert(varTypeIsFloating(simdBaseType)); @@ -1803,11 +1801,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - break; - } - op1 = impSIMDPopStack(); retNode = gtNewSimdCeilNode(retType, op1, simdBaseJitType, simdSize); break; @@ -1862,11 +1855,8 @@ GenTree* 
Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_FLOAT); - if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - op1 = impSIMDPopStack(); - retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize); - } + op1 = impSIMDPopStack(); + retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize); break; } @@ -2326,8 +2316,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - if ((simdSize == 64) || varTypeIsByte(simdBaseType) || varTypeIsLong(simdBaseType) || - (varTypeIsInt(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))) + if ((simdSize == 64) || varTypeIsByte(simdBaseType) || varTypeIsLong(simdBaseType)) { // The lowering for Dot doesn't handle these cases, so import as Sum(left * right) retNode = gtNewSimdBinOpNode(GT_MUL, simdType, op1, op2, simdBaseJitType, simdSize); @@ -2467,14 +2456,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(op1 != nullptr); retNode = gtNewSimdHWIntrinsicNode(retType, op1, moveMaskIntrinsic, simdBaseJitType, simdSize); - - if ((simdSize == 16) && varTypeIsShort(simdBaseType)) - { - if (!compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - retNode->AsHWIntrinsic()->SetMethodHandle(this, method R2RARG(*entryPoint)); - } - } } break; } @@ -2491,11 +2472,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - break; - } - op1 = impSIMDPopStack(); retNode = gtNewSimdFloorNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2561,41 +2537,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - op2 = impStackTop(0).val; - - switch (simdBaseType) - { - case TYP_BYTE: - case TYP_UBYTE: - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - if (!op2->IsIntegralConst(0) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // Using software fallback if simdBaseType is not supported by hardware - return nullptr; - } - break; - } - - case TYP_DOUBLE: - case TYP_FLOAT: - case TYP_SHORT: - case TYP_USHORT: - { - // short/ushort/float/double is supported by SSE2 - break; - } - - default: - { - unreached(); - } - } - - impPopStack(); + op2 = impPopStack().val; op1 = impSIMDPopStack(); retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize); @@ -2752,10 +2694,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); - if ((simdSize == 16) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - break; - } if ((simdSize == 32) && !compOpportunisticallyDependsOn(InstructionSet_AVX2)) { break; @@ -3586,11 +3524,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - break; - } - op1 = impSIMDPopStack(); retNode = gtNewSimdRoundNode(retType, op1, simdBaseJitType, simdSize); break; @@ -3981,11 +3914,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - break; - } - op1 = impSIMDPopStack(); retNode = gtNewSimdTruncNode(retType, op1, simdBaseJitType, simdSize); break; @@ -4040,42 +3968,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case 
NI_Vector512_WithElement: { assert(sig->numArgs == 3); - GenTree* indexOp = impStackTop(1).val; - switch (simdBaseType) + if (varTypeIsLong(simdBaseType)) { - // Using software fallback if simdBaseType is not supported by hardware - case TYP_BYTE: - case TYP_UBYTE: - case TYP_INT: - case TYP_UINT: - if (!compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - return nullptr; - } - break; - - case TYP_LONG: - case TYP_ULONG: - if (!compOpportunisticallyDependsOn(InstructionSet_SSE42_X64)) - { - return nullptr; - } - break; - - case TYP_DOUBLE: - case TYP_FLOAT: - case TYP_SHORT: - case TYP_USHORT: - // short/ushort/float/double is supported by SSE2 - break; - - default: - unreached(); + if (!compOpportunisticallyDependsOn(InstructionSet_X86Base_X64)) + { + return nullptr; + } } - GenTree* valueOp = impPopStack().val; - impPopStack(); // Pop the indexOp now that we know its valid + GenTree* valueOp = impPopStack().val; + GenTree* indexOp = impPopStack().val; GenTree* vectorOp = impSIMDPopStack(); retNode = gtNewSimdWithElementNode(retType, vectorOp, indexOp, valueOp, simdBaseJitType, simdSize); @@ -4914,7 +4817,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } - case NI_SSE42_BlendVariable: + case NI_X86Base_BlendVariable: case NI_AVX_BlendVariable: case NI_AVX2_BlendVariable: case NI_AVX512_BlendVariable: @@ -5021,7 +4924,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } case NI_X86Base_CompareEqual: - case NI_SSE42_CompareEqual: case NI_AVX_CompareEqual: case NI_AVX2_CompareEqual: case NI_AVX512_CompareEqual: @@ -5042,7 +4944,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } case NI_X86Base_CompareGreaterThan: - case NI_SSE42_CompareGreaterThan: case NI_AVX_CompareGreaterThan: case NI_AVX2_CompareGreaterThan: case NI_AVX512_CompareGreaterThan: @@ -5082,7 +4983,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } case NI_X86Base_CompareLessThan: - case NI_SSE42_CompareLessThan: case NI_AVX_CompareLessThan: case NI_AVX2_CompareLessThan: case NI_AVX512_CompareLessThan: diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index f69407f58bb049..b797e7daef7ecf 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -5842,27 +5842,24 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic, #if defined(FEATURE_HW_INTRINSICS) #if defined(TARGET_XARCH) - if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - GenTree* op2 = impPopStack().val; - GenTree* op1 = impPopStack().val; + GenTree* op2 = impPopStack().val; + GenTree* op1 = impPopStack().val; - if (varTypeIsLong(baseType)) - { - hwintrinsic = NI_SSE42_X64_Crc32; - op1 = gtFoldExpr(gtNewCastNode(baseType, op1, /* unsigned */ true, baseType)); - } - else - { - hwintrinsic = NI_SSE42_Crc32; - baseType = genActualType(baseType); - } + if (varTypeIsLong(baseType)) + { + hwintrinsic = NI_X86Base_X64_Crc32; + op1 = gtFoldExpr(gtNewCastNode(baseType, op1, /* unsigned */ true, baseType)); + } + else + { + hwintrinsic = NI_X86Base_Crc32; + baseType = genActualType(baseType); + } - result = gtNewScalarHWIntrinsicNode(baseType, op1, op2, hwintrinsic); + result = gtNewScalarHWIntrinsicNode(baseType, op1, op2, hwintrinsic); - // We use the simdBaseJitType to bring the type of the second argument to codegen - result->AsHWIntrinsic()->SetSimdBaseJitType(baseJitType); - } + // We use the simdBaseJitType to bring the type of the second argument to codegen + 
result->AsHWIntrinsic()->SetSimdBaseJitType(baseJitType); #elif defined(TARGET_ARM64) if (compOpportunisticallyDependsOn(InstructionSet_Crc32)) { @@ -6107,14 +6104,11 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic, } #elif defined(FEATURE_HW_INTRINSICS) #if defined(TARGET_XARCH) - if (compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // Pop the value from the stack - impPopStack(); + // Pop the value from the stack + impPopStack(); - hwintrinsic = varTypeIsLong(baseType) ? NI_SSE42_X64_PopCount : NI_SSE42_PopCount; - result = gtNewScalarHWIntrinsicNode(baseType, op1, hwintrinsic); - } + hwintrinsic = varTypeIsLong(baseType) ? NI_X86Base_X64_PopCount : NI_X86Base_PopCount; + result = gtNewScalarHWIntrinsicNode(baseType, op1, hwintrinsic); #elif defined(TARGET_ARM64) // TODO-ARM64-CQ: PopCount should be handled as an intrinsic for non-constant cases #endif // TARGET_* @@ -8309,6 +8303,8 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName) // instructions to directly compute round/ceiling/floor/truncate. case NI_System_Math_Abs: + case NI_System_Math_Ceiling: + case NI_System_Math_Floor: case NI_System_Math_Max: case NI_System_Math_MaxMagnitude: case NI_System_Math_MaxMagnitudeNumber: @@ -8322,14 +8318,10 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName) case NI_System_Math_MultiplyAddEstimate: case NI_System_Math_ReciprocalEstimate: case NI_System_Math_ReciprocalSqrtEstimate: - case NI_System_Math_Sqrt: - return true; - - case NI_System_Math_Ceiling: - case NI_System_Math_Floor: case NI_System_Math_Round: + case NI_System_Math_Sqrt: case NI_System_Math_Truncate: - return compOpportunisticallyDependsOn(InstructionSet_SSE42); + return true; case NI_System_Math_FusedMultiplyAdd: return compOpportunisticallyDependsOn(InstructionSet_AVX2); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 844a621b5cb563..5e918a8481823f 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1103,7 +1103,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op) var_types simdBaseType = hwintrinsic->GetSimdBaseType(); switch (intrinsicId) { - case NI_SSE42_LoadAndDuplicateToVector128: + case NI_X86Base_LoadAndDuplicateToVector128: case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: { @@ -1127,13 +1127,13 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op) } } - case NI_SSE42_MoveAndDuplicate: + case NI_X86Base_MoveAndDuplicate: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512_BroadcastScalarToVector512: { assert(hwintrinsic->isContained()); - if (intrinsicId == NI_SSE42_MoveAndDuplicate) + if (intrinsicId == NI_X86Base_MoveAndDuplicate) { assert(simdBaseType == TYP_DOUBLE); } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 9e833eb52d6356..5eeb20bb5844c5 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -217,15 +217,21 @@ INSTMUL(imul_31, "imul", IUM_RD, BAD_CODE, 0xD54400003868 #define VEX3FLT(c1,c2) PACK4(c1, 0xc5, 0x02, c2) #define FIRST_SSE_INSTRUCTION INS_addpd -// Instructions for SSE, SSE2 +// Instructions for SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT INST3(addpd, "vaddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles INST3(addps, "vaddps", IUM_WR, BAD_CODE, BAD_CODE, 
PCKFLT(0x58), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles INST3(addsd, "vaddsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles INST3(addss, "vaddss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles +INST3(addsubpd, "vaddsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles +INST3(addsubps, "vaddsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles INST3(andnpd, "vandnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles INST3(andnps, "vandnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles INST3(andpd, "vandpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles INST3(andps, "vandps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles +INST3(blendpd, "vblendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values +INST3(blendps, "vblendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values +INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Doubles +INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Singles INST3(cmppd, "vcmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), 4C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare packed doubles INST3(cmpps, "vcmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), 4C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare packed singles INST3(cmpsd, "vcmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare scalar doubles @@ -258,6 +264,15 @@ INST3(divpd, "vdivpd", IUM_WR, BAD_CODE, BAD_CODE, INST3(divps, "vdivps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), 11C, 3C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed 
singles INST3(divsd, "vdivsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), 13C, 4C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles INST3(divss, "vdivss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), 11C, 3C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles +INST3(dppd, "vdppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), 9C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs +INST3(dpps, "vdpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), 13C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs +INST3(extractps, "vextractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Extract Packed Floating-Point Values +INST3(haddpd, "vhaddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles +INST3(haddps, "vhaddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats +INST3(hsubpd, "vhsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles +INST3(hsubps, "vhsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats +INST3(insertps, "vinsertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value +INST3(lddqu, "vlddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Load Unaligned integer INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, ZERO, 4C, INS_TT_NONE, REX_WIG) INST3(maskmovdqu, "vmaskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), 400C, 6C, INS_TT_NONE, REX_WIG | Encoding_VEX) INST3(maxpd, "vmaxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), 4C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles @@ -273,6 +288,7 @@ INST3(movapd, "vmovapd", IUM_WR, PCKDBL(0x29), BAD_CODE, INST3(movaps, "vmovaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) INST3(movd32, "vmovd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move DWORD between xmm regs <-> memory/r32 regs INST3(movd64, "vmovq", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move QWORD between xmm regs <-> memory/r64 regs +INST3(movddup, "vmovddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), ILLEGAL, ILLEGAL, INS_TT_MOVDDUP, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate Double FP Values 
INST3(movdqa32, "vmovdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName) INST3(movdqu32, "vmovdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName) INST3(movhlps, "vmovhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), 1C, 1C, INS_TT_NONE, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) @@ -284,23 +300,31 @@ INST3(movlps, "vmovlps", IUM_WR, PCKFLT(0x13), BAD_CODE, INST3(movmskpd, "vmovmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. INST3(movmskps, "vmovmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) INST3(movntdq, "vmovntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(movntdqa, "vmovntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint INST3(movnti32, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_REX2) INST3(movnti64, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_REX2) INST3(movntpd, "vmovntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) INST3(movntps, "vmovntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) INST3(movq, "vmovq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move Quadword between memory/mm <-> regs INST3(movsd_simd, "vmovsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movshdup, "vmovshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate odd-indexed Single FP Values +INST3(movsldup, "vmovsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate even-indexed Single FP Values INST3(movss, "vmovss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movupd, "vmovupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) INST3(movups, "vmovups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(mpsadbw, "vmpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), 4C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference INST3(mulpd, "vmulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | 
REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles INST3(mulps, "vmulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles INST3(mulsd, "vmulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles INST3(mulss, "vmulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single INST3(orpd, "vorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles INST3(orps, "vorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles +INST3(pabsb, "vpabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of bytes +INST3(pabsd, "vpabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 32-bit integers +INST3(pabsw, "vpabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 16-bit integers INST3(packssdw, "vpackssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation INST3(packsswb, "vpacksswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation +INST3(packusdw, "vpackusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation INST3(packuswb, "vpackuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation INST3(paddb, "vpaddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers INST3(paddd, "vpaddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers @@ -310,26 +334,68 @@ INST3(paddsw, "vpaddsw", IUM_WR, BAD_CODE, BAD_CODE, INST3(paddusb, "vpaddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add 
packed unsigned byte integers and saturate the results INST3(paddusw, "vpaddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results INST3(paddw, "vpaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers +INST3(palignr, "vpalignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right INST3(pandd, "vpand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND of two xmm regs INST3(pandnd, "vpandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND NOT of two xmm regs INST3(pavgb, "vpavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers INST3(pavgw, "vpavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers +INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Bytes +INST3(pblendw, "vpblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words INST3(pcmpeqb, "vpcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality INST3(pcmpeqd, "vpcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality +INST3(pcmpeqq, "vpcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality INST3(pcmpeqw, "vpcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality INST3(pcmpgtb, "vpcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than INST3(pcmpgtd, "vpcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than +INST3(pcmpgtq, "vpcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), 3C, 1C, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality INST3(pcmpgtw, "vpcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), 1C, 2X, 
INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than -INST3(pextrw, "vpextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract 16-bit value into a r32 with zero extended to 32-bits +INST3(pextrb, "vpextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Byte +INST3(pextrd, "vpextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Dword +INST3(pextrq, "vpextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Extract Qword +INST3(phaddd, "vphaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add +INST3(pextrw, "vpextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Word +INST3(phaddsw, "vphaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation +INST3(phaddw, "vphaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers +INST3(phminposuw, "vphminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Packed Horizontal Word Minimum +INST3(phsubd, "vphsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers +INST3(phsubsw, "vphsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation +INST3(phsubw, "vphsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers +INST3(pinsrb, "vpinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte +INST3(pinsrd, "vpinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword +INST3(pinsrq, "vpinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword INST3(pinsrw, "vpinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index +INST3(pmaddubsw, "vpmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes INST3(pmaddwd, "vpmaddwd", IUM_WR, BAD_CODE, 
BAD_CODE, PCKDBL(0xF5), 5C, 2X, INS_TT_FULL_MEM, KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst +INST3(pmaxsb, "vpmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes +INST3(pmaxsd, "vpmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers INST3(pmaxsw, "vpmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words INST3(pmaxub, "vpmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes +INST3(pmaxud, "vpmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers +INST3(pmaxuw, "vpmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers +INST3(pminsb, "vpminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes +INST3(pminsd, "vpminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers INST3(pminsw, "vpminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words INST3(pminub, "vpminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes +INST3(pminud, "vpminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers +INST3(pminuw, "vpminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers INST3(pmovmskb, "vpmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) // Move the MSB bits of all bytes in a xmm reg to an int reg +INST3(pmovsxbd, "vpmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to int +INST3(pmovsxbq, "vpmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, 
Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to long +INST3(pmovsxbw, "vpmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to short +INST3(pmovsxdq, "vpmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed sign extend int to long +INST3(pmovsxwd, "vpmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to int +INST3(pmovsxwq, "vpmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to long +INST3(pmovzxbd, "vpmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to intg +INST3(pmovzxbq, "vpmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to lon +INST3(pmovzxbw, "vpmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to short +INST3(pmovzxdq, "vpmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed zero extend int to long +INST3(pmovzxwd, "vpmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to int +INST3(pmovzxwq, "vpmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to long +INST3(pmuldq, "vpmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result +INST3(pmulhrsw, "vpmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale INST3(pmulhuw, "vpmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers INST3(pmulhw, "vpmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers +INST3(pmulld, "vpmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), 10C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result INST3(pmullw, "vpmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), 5C, 2X, INS_TT_FULL_MEM, 
KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result INST3(pmuludq, "vpmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result INST3(pord, "vpor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise OR of two xmm regs @@ -338,9 +404,13 @@ INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, ZERO, 2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2) INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, ZERO, 2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2) INST3(psadbw, "vpsadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), 3C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers +INST3(pshufb, "vpshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes INST3(pshufd, "vpshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed shuffle of 32-bit integers INST3(pshufhw, "vpshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), 1C, 1C, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. INST3(pshuflw, "vpshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), 1C, 1C, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1. 
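Note: for readers less familiar with the byte-shuffle entry above (`pshufb`, emitted as `vpshufb`), the following is a small illustrative C# sketch of its semantics. `Ssse3.Shuffle` and `Vector128.Create` are the real managed APIs; the class name and the concrete lane values are invented for demonstration and are not part of this change.

// Illustrative sketch only: demonstrates the pshufb ("Packed Shuffle Bytes")
// semantics described by the table comment above. Values are arbitrary examples.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class PshufbSketch
{
    static void Main()
    {
        if (!Ssse3.IsSupported)
        {
            return;
        }

        Vector128<byte> source = Vector128.Create(
            (byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        // Each mask byte selects a source byte by index; a set high bit zeroes the lane.
        Vector128<byte> reverse = Vector128.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        Vector128<byte> shuffled = Ssse3.Shuffle(source, reverse); // emits pshufb/vpshufb
        Console.WriteLine(shuffled); // <15, 14, 13, ..., 0>
    }
}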
+INST3(psignb, "vpsignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(psignd, "vpsignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN +INST3(psignw, "vpsignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN INST3(pslld, "vpslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), ILLEGAL, ILLEGAL, INS_TT_FULL | INS_TT_MEM128, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers INST3(pslldq, "vpslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes INST3(psllq, "vpsllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), ILLEGAL, ILLEGAL, INS_TT_FULL | INS_TT_MEM128, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers @@ -359,6 +429,7 @@ INST3(psubsw, "vpsubsw", IUM_WR, BAD_CODE, BAD_CODE, INST3(psubusb, "vpsubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation INST3(psubusw, "vpsubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation INST3(psubw, "vpsubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers +INST3(ptest, "vptest", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x17), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare INST3(punpckhbw, "vpunpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) INST3(punpckhdq, "vpunpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) INST3(punpckhqdq, "vpunpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), 1C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi) @@ -370,6 +441,10 @@ INST3(punpcklwd, "vpunpcklwd", IUM_WR, BAD_CODE, BAD_CODE, INST3(pxord, "vpxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise XOR of two xmm regs INST3(rcpps, "vrcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), 4C, 1C, 
INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Reciprocal of packed singles INST3(rcpss, "vrcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single +INST3(roundpd, "vroundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), 8C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed double precision floating-point values +INST3(roundps, "vroundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed single precision floating-point values +INST3(roundsd, "vroundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar double precision floating-point values +INST3(roundss, "vroundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar single precision floating-point values INST3(rsqrtps, "vrsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Reciprocal Sqrt of packed singles INST3(rsqrtss, "vrsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, ZERO, 6C, INS_TT_NONE, REX_WIG) @@ -392,84 +467,6 @@ INST3(unpcklps, "vunpcklps", IUM_WR, BAD_CODE, BAD_CODE, INST3(xorpd, "vxorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles INST3(xorps, "vxorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles -// Instructions for SSE3, SSSE3, SSE41, SSE42, POPCNT -INST3(addsubpd, "vaddsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles -INST3(addsubps, "vaddsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles -INST3(blendpd, "vblendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values -INST3(blendps, "vblendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values -INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Doubles -INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Singles -INST3(dppd, "vdppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), 9C, 
1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs -INST3(dpps, "vdpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), 13C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs -INST3(extractps, "vextractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Extract Packed Floating-Point Values -INST3(haddpd, "vhaddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles -INST3(haddps, "vhaddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats -INST3(hsubpd, "vhsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles -INST3(hsubps, "vhsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats -INST3(insertps, "vinsertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value -INST3(lddqu, "vlddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Load Unaligned integer -INST3(movddup, "vmovddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), ILLEGAL, ILLEGAL, INS_TT_MOVDDUP, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate Double FP Values -INST3(movntdqa, "vmovntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint -INST3(movshdup, "vmovshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate odd-indexed Single FP Values -INST3(movsldup, "vmovsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate even-indexed Single FP Values -INST3(mpsadbw, "vmpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), 4C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference -INST3(pabsb, "vpabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of bytes -INST3(pabsd, "vpabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 32-bit integers -INST3(pabsw, "vpabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 16-bit integers -INST3(packusdw, "vpackusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation -INST3(palignr, "vpalignr", IUM_WR, 
BAD_CODE, BAD_CODE, SSE3A(0x0F), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right -INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Bytes -INST3(pblendw, "vpblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words -INST3(pcmpeqq, "vpcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pcmpgtq, "vpcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), 3C, 1C, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality -INST3(pextrb, "vpextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Byte -INST3(pextrd, "vpextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Dword -INST3(pextrq, "vpextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Extract Qword -INST3(pextrw_sse42, "vpextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Word -INST3(phaddd, "vphaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add -INST3(phaddsw, "vphaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation -INST3(phaddw, "vphaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers -INST3(phminposuw, "vphminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Packed Horizontal Word Minimum -INST3(phsubd, "vphsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers -INST3(phsubsw, "vphsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation -INST3(phsubw, "vphsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers -INST3(pinsrb, "vpinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte -INST3(pinsrd, "vpinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword -INST3(pinsrq, "vpinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword -INST3(pmaddubsw, "vpmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes -INST3(pmaxsb, "vpmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes -INST3(pmaxsd, "vpmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers -INST3(pmaxud, "vpmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers -INST3(pmaxuw, "vpmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers -INST3(pminsb, "vpminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes -INST3(pminsd, "vpminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers -INST3(pminud, "vpminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers -INST3(pminuw, "vpminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers -INST3(pmovsxbd, "vpmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to int -INST3(pmovsxbq, "vpmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to long -INST3(pmovsxbw, "vpmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to short -INST3(pmovsxdq, "vpmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed sign extend int to long -INST3(pmovsxwd, "vpmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to int -INST3(pmovsxwq, "vpmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to long -INST3(pmovzxbd, "vpmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | 
Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to intg -INST3(pmovzxbq, "vpmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to lon -INST3(pmovzxbw, "vpmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to short -INST3(pmovzxdq, "vpmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed zero extend int to long -INST3(pmovzxwd, "vpmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to int -INST3(pmovzxwq, "vpmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to long -INST3(pmuldq, "vpmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result -INST3(pmulhrsw, "vpmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale -INST3(pmulld, "vpmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), 10C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result -INST3(pshufb, "vpshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes -INST3(psignb, "vpsignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignd, "vpsignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(psignw, "vpsignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN -INST3(ptest, "vptest", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x17), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare -INST3(roundpd, "vroundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), 8C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed double precision floating-point values -INST3(roundps, "vroundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed single precision floating-point values -INST3(roundsd, "vroundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) 
// Round scalar double precision floating-point values -INST3(roundss, "vroundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar single precision floating-point values - // Instructions for AESNI, PCLMULQDQ INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow INST3(aesdeclast, "vaesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index f5d05b2a95aa66..a2b03e6377c034 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -394,7 +394,6 @@ RELEASE_CONFIG_INTEGER(EnableHWIntrinsic, "EnableHWIntrinsic", #endif // defined(TARGET_LOONGARCH64) #if defined(TARGET_AMD64) || defined(TARGET_X86) -RELEASE_CONFIG_INTEGER(EnableSSE42, "EnableSSE42", 1) // Allows SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, and dependent hardware intrinsics to be disabled RELEASE_CONFIG_INTEGER(EnableAVX, "EnableAVX", 1) // Allows AVX and dependent hardware intrinsics to be disabled RELEASE_CONFIG_INTEGER(EnableAVX2, "EnableAVX2", 1) // Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled RELEASE_CONFIG_INTEGER(EnableAVX512, "EnableAVX512", 1) // Allows AVX512 F+BW+CD+DQ+VL and depdendent hardware intrinsics to be disabled diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 9b1889e1673b8e..9e52cd07f388ff 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -7262,8 +7262,8 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par } #ifdef TARGET_ARM64 - const bool hasRcpc2 = comp->compOpportunisticallyDependsOn(InstructionSet_Rcpc2); - if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile() && !hasRcpc2) + if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile() && + !comp->compOpportunisticallyDependsOn(InstructionSet_Rcpc2)) { // For Arm64 we avoid using LEA for volatile INDs // because we won't be able to use ldar/star @@ -7306,7 +7306,8 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par // Generally, we try to avoid creating addressing modes for volatile INDs so we can then use // ldar/stlr instead of ldr/str + dmb. Although, with Arm 8.4+'s RCPC2 we can handle unscaled // addressing modes (if the offset fits into 9 bits) - assert(hasRcpc2); + assert(comp->compIsaSupportedDebugOnly(InstructionSet_Rcpc2)); + if ((scale > 1) || (!emitter::emitIns_valid_imm_for_unscaled_ldst_offset(offset)) || (index != nullptr)) { return false; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index db9c8db0ba9f33..f44117ba0ce00e 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1083,37 +1083,12 @@ void Lowering::LowerCast(GenTree* tree) // in range, with bits left over. e.g. we might have a value of 4294967295.9999995. // We must, therefore, truncate the value before wrapping it to negative. 
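Note: as a hedged illustration of the comment above, the two truncation strategies involved in this hunk, the round-toward-zero path that the change keeps and the bit-masking fallback that it removes, can be compared from managed code roughly as follows. `Sse41.RoundToZeroScalar`, `Sse2.ShiftRightLogical`/`ShiftLeftLogical` and the `Vector128` reinterpret helpers are real APIs; the class name and the printed expectations are illustrative only, and the sample value is taken from the comment above.

// Illustrative sketch only: not the JIT's actual code generation.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class TruncateSketch
{
    static void Main()
    {
        if (!Sse41.IsSupported)
        {
            return;
        }

        Vector128<double> src = Vector128.CreateScalar(4294967295.9999995);

        // Strategy kept by this change: round toward zero (roundsd).
        Vector128<double> viaRoundsd = Sse41.RoundToZeroScalar(src);

        // Removed fallback: clear the 21 fraction bits that remain when the
        // exponent is exactly 31 (31 integer bits + 21 fraction bits = 52).
        Vector128<double> viaMasking =
            Sse2.ShiftLeftLogical(Sse2.ShiftRightLogical(src.AsUInt64(), 21), 21).AsDouble();

        Console.WriteLine(viaRoundsd.ToScalar()); // 4294967295
        Console.WriteLine(viaMasking.ToScalar()); // 4294967295
    }
}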
- if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // This creates the equivalent of the following C# code: - // floorVal = Sse41.RoundToZeroScalar(srcVector); + // This creates the equivalent of the following C# code: + // floorVal = Sse41.RoundToZeroScalar(srcVector); - floorVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, NI_SSE42_RoundToZeroScalar, - srcBaseType, 16); - castRange.InsertAtEnd(floorVal); - } - else - { - // We don't have `roundsd` available, but we can truncate the value by simply zeroing out - // the low 21 bits of the double. This works because we know we will only use the negative - // value when the exponent is exactly 31, meaning 31 of the 52 bits in the significand are - // used for the whole portion of the number, and the remaining 21 bits are fractional. - // - // This creates the equivalent of the following C# code: - // floorVal = ((srcVector.AsUInt64() >>> 21) << 21).AsDouble(); - - GenTree* twentyOne = comp->gtNewIconNode(21); - GenTree* rightShift = comp->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, floorVal, twentyOne, - CORINFO_TYPE_ULONG, 16); - castRange.InsertAtEnd(twentyOne); - castRange.InsertAtEnd(rightShift); - - twentyOne = comp->gtClone(twentyOne); - floorVal = comp->gtNewSimdBinOpNode(GT_LSH, TYP_SIMD16, rightShift, twentyOne, - CORINFO_TYPE_ULONG, 16); - castRange.InsertAtEnd(twentyOne); - castRange.InsertAtEnd(floorVal); - } + floorVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, NI_X86Base_RoundToZeroScalar, + srcBaseType, 16); + castRange.InsertAtEnd(floorVal); } GenTree* wrapVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, maxFloatingValue, @@ -1148,42 +1123,16 @@ void Lowering::LowerCast(GenTree* tree) GenTree* resultClone = comp->gtClone(result); castRange.InsertAtEnd(resultClone); - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // If the conversion of the fixed-up value overflowed, the result wil be - // int.MinValue. Since `blendvps` uses only the MSB for result selection, - // this is adequate to force selection of the negated result. - // - // This creates the equivalent of the following C# code: - // convertResult = Sse41.BlendVariable(result, negated, result); + // If the conversion of the fixed-up value overflowed, the result wil be + // int.MinValue. Since `blendvps` uses only the MSB for result selection, + // this is adequate to force selection of the negated result. + // + // This creates the equivalent of the following C# code: + // convertResult = Sse41.BlendVariable(result, negated, result); - convertResult = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, result, negated, resultClone, - NI_SSE42_BlendVariable, CORINFO_TYPE_FLOAT, 16); - } - else - { - // If we can't use `blendvps`, we do a bit-wise selection. This works - // using only and+or because if we choose the negated value, both it - // and the overflowed result have MSB set. 
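Note: a similar hedged sketch of the two selection strategies consolidated in this hunk, the blendvps-based variable blend versus the removed and/or fallback. Both rely on the observation in the comments above that the negated value and the overflowed result both have their sign bit set. `Sse41.BlendVariable`, `Sse2.ShiftRightArithmetic`, `Sse2.And` and `Sse2.Or` are real APIs; the lane values are invented for demonstration.

// Illustrative sketch only: not the JIT's actual code generation.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class SelectSketch
{
    static void Main()
    {
        if (!Sse41.IsSupported)
        {
            return;
        }

        // Lane 0 pretends the conversion overflowed (int.MinValue); lane 1 did not.
        Vector128<int> result  = Vector128.Create(int.MinValue, 1234, 0, 0);
        Vector128<int> negated = Vector128.Create(-5, -1, -1, -1);

        // Strategy kept by this change: blendvps selects per 32-bit lane on the
        // mask's sign bit, so `result` itself can serve as the mask.
        Vector128<int> viaBlend = Sse41.BlendVariable(
            result.AsSingle(), negated.AsSingle(), result.AsSingle()).AsInt32();

        // Removed fallback: arithmetic shift broadcasts the sign bit, then and+or
        // selects `negated`. The or with `result` is harmless in the overflow case
        // because both values have the sign bit set.
        Vector128<int> mask = Sse2.ShiftRightArithmetic(result, 31);
        Vector128<int> viaAndOr = Sse2.Or(result, Sse2.And(negated, mask));

        Console.WriteLine(viaBlend);  // <-5, 1234, 0, 0>
        Console.WriteLine(viaAndOr);  // <-5, 1234, 0, 0>
    }
}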
- // - // This creates the equivalent of the following C# code: - // var mask = Sse2.ShiftRightArithmetic(result, 31); - // convertResult = Sse.Or(result, Sse.And(negated, mask)); - - GenTree* thirtyOne = comp->gtNewIconNode(31); - GenTree* mask = - comp->gtNewSimdBinOpNode(GT_RSH, TYP_SIMD16, result, thirtyOne, CORINFO_TYPE_INT, 16); - GenTree* andMask = - comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, mask, negated, dstBaseType, 16); - - castRange.InsertAtEnd(thirtyOne); - castRange.InsertAtEnd(mask); - castRange.InsertAtEnd(andMask); - - convertResult = - comp->gtNewSimdBinOpNode(GT_OR, TYP_SIMD16, andMask, resultClone, dstBaseType, 16); - } + convertResult = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, result, negated, resultClone, + NI_X86Base_BlendVariable, CORINFO_TYPE_FLOAT, 16); // Because the results are in a SIMD register, we need to ToScalar() them out. castRange.InsertAtEnd(convertResult); @@ -1321,7 +1270,7 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn } break; - case NI_SSE42_PTEST: + case NI_X86Base_PTEST: case NI_AVX_PTEST: { // If we need the Carry flag then we can't swap operands. @@ -2223,7 +2172,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return LowerHWIntrinsicToScalar(node); } - case NI_SSE42_Extract: + case NI_X86Base_Extract: { if (varTypeIsFloating(node->GetSimdBaseType())) { @@ -2253,7 +2202,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_Insert: + case NI_X86Base_Insert: { assert(node->GetOperandCount() == 3); @@ -2397,7 +2346,8 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); - if ((op1Intrinsic->GetHWIntrinsicId() != NI_SSE42_Insert) || (op1Intrinsic->GetSimdBaseType() != TYP_FLOAT)) + if ((op1Intrinsic->GetHWIntrinsicId() != NI_X86Base_Insert) || + (op1Intrinsic->GetSimdBaseType() != TYP_FLOAT)) { // Nothing to do if op1 isn't a float32 Sse41.Insert break; @@ -2529,7 +2479,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) } case NI_X86Base_CompareLessThan: - case NI_SSE42_CompareLessThan: case NI_AVX2_CompareLessThan: { if (varTypeIsFloating(node->GetSimdBaseType())) @@ -2549,12 +2498,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_CompareLessThan: - { - newIntrinsicId = NI_SSE42_CompareGreaterThan; - break; - } - case NI_AVX2_CompareLessThan: { newIntrinsicId = NI_AVX2_CompareGreaterThan; @@ -2613,14 +2556,14 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerHWIntrinsicCC(node, NI_X86Base_UCOMIS, GenCondition::FGT); break; - case NI_SSE42_TestC: - LowerHWIntrinsicCC(node, NI_SSE42_PTEST, GenCondition::C); + case NI_X86Base_TestC: + LowerHWIntrinsicCC(node, NI_X86Base_PTEST, GenCondition::C); break; - case NI_SSE42_TestZ: - LowerHWIntrinsicCC(node, NI_SSE42_PTEST, GenCondition::EQ); + case NI_X86Base_TestZ: + LowerHWIntrinsicCC(node, NI_X86Base_PTEST, GenCondition::EQ); break; - case NI_SSE42_TestNotZAndNotC: - LowerHWIntrinsicCC(node, NI_SSE42_PTEST, GenCondition::UGT); + case NI_X86Base_TestNotZAndNotC: + LowerHWIntrinsicCC(node, NI_X86Base_PTEST, GenCondition::UGT); break; case NI_AVX_TestC: @@ -2739,10 +2682,9 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { bool isOp2VectorZero = op2->IsVectorZero(); - if ((isOp2VectorZero || op2->IsVectorAllBitsSet()) && - comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) + if (isOp2VectorZero || 
op2->IsVectorAllBitsSet()) { - // On SSE4.2 or higher we can optimize comparisons against Zero or AllBitsSet to + // We can optimize comparisons against Zero or AllBitsSet to // just use PTEST. We can't support it for floating-point, however, as it has // both +0.0 and -0.0 where +0.0 == -0.0 @@ -2856,7 +2798,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm else { assert(simdSize == 16); - LowerHWIntrinsicCC(node, NI_SSE42_PTEST, cmpCnd); + LowerHWIntrinsicCC(node, NI_X86Base_PTEST, cmpCnd); } return LowerNode(node); } @@ -3204,7 +3146,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm GenTreeHWIntrinsic* nestedIntrin = nestedOp2->AsHWIntrinsic(); NamedIntrinsic nestedIntrinId = nestedIntrin->GetHWIntrinsicId(); - if ((nestedIntrinId == NI_SSE42_MoveAndDuplicate) || + if ((nestedIntrinId == NI_X86Base_MoveAndDuplicate) || (nestedIntrinId == NI_AVX2_BroadcastScalarToVector128) || (nestedIntrinId == NI_AVX2_BroadcastScalarToVector256) || (nestedIntrinId == NI_AVX512_BroadcastScalarToVector512)) @@ -3410,15 +3352,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm { assert(simdSize == 16); - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - cmpIntrinsic = NI_SSE42_CompareEqual; - } - else - { - cmpIntrinsic = NI_X86Base_CompareEqual; - cmpJitType = CORINFO_TYPE_UINT; - } + cmpIntrinsic = NI_X86Base_CompareEqual; mskIntrinsic = NI_X86Base_MoveMask; mskConstant = 0xFFFF; } @@ -3639,13 +3573,12 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) blendVariableId = NI_AVX2_BlendVariable; } } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) + else { // For Vector128, BlendVariable is available on SSE41 - blendVariableId = NI_SSE42_BlendVariable; + blendVariableId = NI_X86Base_BlendVariable; } - // If blendVariableId has been set, the architecture supports BlendVariable, so we can optimize if (blendVariableId != NI_Illegal) { // result = BlendVariable op3 (right) op2 (left) op1 (mask) @@ -3875,7 +3808,6 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } case NI_X86Base_CompareEqual: - case NI_SSE42_CompareEqual: case NI_AVX_CompareEqual: case NI_AVX2_CompareEqual: { @@ -3884,7 +3816,6 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } case NI_X86Base_CompareGreaterThan: - case NI_SSE42_CompareGreaterThan: case NI_AVX_CompareGreaterThan: case NI_AVX2_CompareGreaterThan: { @@ -3900,7 +3831,6 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } case NI_X86Base_CompareLessThan: - case NI_SSE42_CompareLessThan: case NI_AVX_CompareLessThan: case NI_AVX2_CompareLessThan: { @@ -4446,59 +4376,24 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) case TYP_BYTE: case TYP_UBYTE: { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // We will be constructing the following parts: - // ... - // tmp2 = CNS_VEC simd16 0 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 ubyte Shuffle - - // This is roughly the following managed code: - // ... - // var tmp2 = Vector128.Zero; - // return Ssse3.Shuffle(tmp1, tmp2); - - tmp2 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertAfter(tmp1, tmp2); - LowerNode(tmp2); - - node->ResetHWIntrinsicId(NI_SSE42_Shuffle, tmp1, tmp2); - break; - } - // We will be constructing the following parts: // ... 
- // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp1 = * HWINTRINSIC simd16 ubyte UnpackLow - // ... + // tmp2 = CNS_VEC simd16 0 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ubyte Shuffle // This is roughly the following managed code: // ... - // var tmp2 = tmp1; - // tmp1 = Sse2.UnpackLow(tmp1, tmp2); - // ... - - node->Op(1) = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->Op(1); + // var tmp2 = Vector128.Zero; + // return Ssse3.Shuffle(tmp1, tmp2); - tmp2 = comp->gtClone(tmp1); + tmp2 = comp->gtNewZeroConNode(simdType); BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_X86Base_UnpackLow, CORINFO_TYPE_UBYTE, - simdSize); - BlockRange().InsertAfter(tmp2, tmp1); - LowerNode(tmp1); - - FALLTHROUGH; + node->ResetHWIntrinsicId(NI_X86Base_Shuffle, tmp1, tmp2); + break; } case TYP_SHORT: @@ -4617,8 +4512,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) case TYP_ULONG: case TYP_DOUBLE: { - if ((IsContainableMemoryOp(op1) || simdBaseType == TYP_DOUBLE) && - comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) + if (IsContainableMemoryOp(op1) || (simdBaseType == TYP_DOUBLE)) { // We will be constructing the following parts: // ... @@ -4629,7 +4523,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // ... // return Sse3.MoveAndDuplicate(tmp1); - node->ChangeHWIntrinsicId(NI_SSE42_MoveAndDuplicate, tmp1); + node->ChangeHWIntrinsicId(NI_X86Base_MoveAndDuplicate, tmp1); node->SetSimdBaseJitType(CORINFO_TYPE_DOUBLE); break; } @@ -4759,178 +4653,54 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) case TYP_INT: case TYP_UINT: { - NamedIntrinsic insIntrinsic = NI_Illegal; + NamedIntrinsic insIntrinsic = NI_X86Base_Insert; - if ((simdBaseType == TYP_SHORT) || (simdBaseType == TYP_USHORT)) + for (size_t N = 1; N < argCnt - 1; N++) { - insIntrinsic = NI_X86Base_Insert; - } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - insIntrinsic = NI_SSE42_Insert; - } - - if (insIntrinsic != NI_Illegal) - { - for (size_t N = 1; N < argCnt - 1; N++) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int N - // /--* tmp1 simd16 - // +--* opN T - // +--* idx int - // tmp1 = * HWINTRINSIC simd16 T Insert - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Sse?.Insert(tmp1, opN, N); - // ... - - GenTree* opN = node->Op(N + 1); - - idx = comp->gtNewIconNode(N, TYP_INT); - // Place the insert as early as possible to avoid creating a lot of long lifetimes. - GenTree* insertionPoint = LIR::LastNode(tmp1, opN); - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, opN, idx, insIntrinsic, simdBaseJitType, - simdSize); - BlockRange().InsertAfter(insertionPoint, idx, tmp1); - LowerNode(tmp1); - } - // We will be constructing the following parts: - // idx = CNS_INT int (argCnt - 1) - // /--* tmp1 simd16 - // +--* lastOp T - // +--* idx int - // node = * HWINTRINSIC simd16 T Insert + // ... + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... // This is roughly the following managed code: // ... - // tmp1 = Sse?.Insert(tmp1, lastOp, (argCnt - 1)); + // tmp1 = Sse?.Insert(tmp1, opN, N); // ... 
- GenTree* lastOp = node->Op(argCnt); - - idx = comp->gtNewIconNode((argCnt - 1), TYP_INT); - BlockRange().InsertAfter(lastOp, idx); - - node->ResetHWIntrinsicId(insIntrinsic, comp, tmp1, lastOp, idx); - break; - } - - assert((simdBaseType != TYP_SHORT) && (simdBaseType != TYP_USHORT)); - - GenTree* op[16]; - op[0] = tmp1; - - for (size_t N = 1; N < argCnt; N++) - { GenTree* opN = node->Op(N + 1); - op[N] = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, opN, simdBaseJitType, 16); - LowerNode(op[N]); - } + idx = comp->gtNewIconNode(N, TYP_INT); + // Place the insert as early as possible to avoid creating a lot of long lifetimes. + GenTree* insertionPoint = LIR::LastNode(tmp1, opN); - if ((simdBaseType == TYP_BYTE) || (simdBaseType == TYP_UBYTE)) - { - for (size_t N = 0; N < argCnt; N += 4) - { - // We will be constructing the following parts: - // ... - // /--* opN T - // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opO T - // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opN simd16 - // +--* opO simd16 - // tmp1 = * HWINTRINSIC simd16 T UnpackLow - // /--* opP T - // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opQ T - // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opP simd16 - // +--* opQ simd16 - // tmp2 = * HWINTRINSIC simd16 T UnpackLow - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // tmp3 = * HWINTRINSIC simd16 T UnpackLow - // ... - - // This is roughly the following managed code: - // ... - // tmp1 = Sse2.UnpackLow(opN, opO); - // tmp2 = Sse2.UnpackLow(opP, opQ); - // tmp3 = Sse2.UnpackLow(tmp1, tmp2); - // ... - - size_t O = N + 1; - size_t P = N + 2; - size_t Q = N + 3; - - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[N], op[O], NI_X86Base_UnpackLow, - CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(LIR::LastNode(op[N], op[O]), tmp1); - LowerNode(tmp1); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[P], op[Q], NI_X86Base_UnpackLow, - CORINFO_TYPE_UBYTE, simdSize); - BlockRange().InsertAfter(LIR::LastNode(op[P], op[Q]), tmp2); - LowerNode(tmp2); - - tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_X86Base_UnpackLow, - CORINFO_TYPE_USHORT, simdSize); - BlockRange().InsertAfter(LIR::LastNode(tmp1, tmp2), tmp3); - LowerNode(tmp3); - - // This caches the result in index 0 through 3, depending on which - // loop iteration this is and allows the rest of the logic to be - // shared with the TYP_INT and TYP_UINT path. - - op[N / 4] = tmp3; - } + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, opN, idx, insIntrinsic, simdBaseJitType, simdSize); + BlockRange().InsertAfter(insertionPoint, idx, tmp1); + LowerNode(tmp1); } // We will be constructing the following parts: - // ... - // /--* opN T - // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opO T - // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opN simd16 - // +--* opO simd16 - // tmp1 = * HWINTRINSIC simd16 T UnpackLow - // /--* opP T - // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opQ T - // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opP simd16 - // +--* opQ simd16 - // tmp2 = * HWINTRINSIC simd16 T UnpackLow - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T UnpackLow + // idx = CNS_INT int (argCnt - 1) + // /--* tmp1 simd16 + // +--* lastOp T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert // This is roughly the following managed code: // ... 
- // tmp1 = Sse2.UnpackLow(opN, opO); - // tmp2 = Sse2.UnpackLow(opP, opQ); - // return Sse2.UnpackLow(tmp1, tmp2); + // tmp1 = Sse?.Insert(tmp1, lastOp, (argCnt - 1)); + // ... - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_X86Base_UnpackLow, CORINFO_TYPE_UINT, - simdSize); - BlockRange().InsertAfter(LIR::LastNode(op[0], op[1]), tmp1); - LowerNode(tmp1); + GenTree* lastOp = node->Op(argCnt); - tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_X86Base_UnpackLow, CORINFO_TYPE_UINT, - simdSize); - BlockRange().InsertAfter(LIR::LastNode(op[2], op[3]), tmp2); - LowerNode(tmp2); + idx = comp->gtNewIconNode((argCnt - 1), TYP_INT); + BlockRange().InsertAfter(lastOp, idx); - node->ResetHWIntrinsicId(NI_X86Base_UnpackLow, tmp1, tmp2); - node->SetSimdBaseJitType(CORINFO_TYPE_ULONG); + node->ResetHWIntrinsicId(insIntrinsic, comp, tmp1, lastOp, idx); break; } @@ -4939,49 +4709,11 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) unsigned N = 0; GenTree* opN = nullptr; - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - assert(argCnt <= 4); - GenTree* insertedNodes[4]; - - for (N = 1; N < argCnt - 1; N++) - { - // We will be constructing the following parts: - // ... - // - // /--* opN T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // idx = CNS_INT int N - // /--* tmp1 simd16 - // +--* opN T - // +--* idx int - // tmp1 = * HWINTRINSIC simd16 T Insert - // ... - - // This is roughly the following managed code: - // ... - // tmp2 = Vector128.CreateScalarUnsafe(opN); - // tmp1 = Sse41.Insert(tmp1, tmp2, N << 4); - // ... - - opN = node->Op(N + 1); - - tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, opN, simdBaseJitType, 16); - LowerNode(tmp2); - - idx = comp->gtNewIconNode(N << 4, TYP_INT); - - // Place the insert as early as possible to avoid creating a lot of long lifetimes. - GenTree* insertionPoint = LIR::LastNode(tmp1, tmp2); - - tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, idx, NI_SSE42_Insert, simdBaseJitType, - simdSize); - BlockRange().InsertAfter(insertionPoint, idx, tmp3); - - insertedNodes[N] = tmp3; - tmp1 = tmp3; - } + assert(argCnt <= 4); + GenTree* insertedNodes[4]; + for (N = 1; N < argCnt - 1; N++) + { // We will be constructing the following parts: // ... // @@ -4991,85 +4723,70 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // /--* tmp1 simd16 // +--* opN T // +--* idx int - // node = * HWINTRINSIC simd16 T Insert + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... // This is roughly the following managed code: // ... // tmp2 = Vector128.CreateScalarUnsafe(opN); - // return Sse41.Insert(tmp1, tmp2, N << 4); + // tmp1 = Sse41.Insert(tmp1, tmp2, N << 4); + // ... - opN = node->Op(argCnt); + opN = node->Op(N + 1); tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, opN, simdBaseJitType, 16); LowerNode(tmp2); - idx = comp->gtNewIconNode((argCnt - 1) << 4, TYP_INT); - BlockRange().InsertAfter(tmp2, idx); + idx = comp->gtNewIconNode(N << 4, TYP_INT); - node->ResetHWIntrinsicId(NI_SSE42_Insert, comp, tmp1, tmp2, idx); + // Place the insert as early as possible to avoid creating a lot of long lifetimes. + GenTree* insertionPoint = LIR::LastNode(tmp1, tmp2); - for (N = 1; N < argCnt - 1; N++) - { - // LowerNode for NI_SSE42_Insert specially handles zeros, constants, and certain mask values - // to do the minimal number of operations and may merge together two neighboring inserts that - // don't have any side effects between them. 
Because of this and because of the interdependence - // of the inserts we've created above, we need to wait to lower the generated inserts until after - // we've completed the chain. - - GenTree* insertedNode = insertedNodes[N]; - LowerNode(insertedNode); - } - break; + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, idx, NI_X86Base_Insert, simdBaseJitType, + simdSize); + BlockRange().InsertAfter(insertionPoint, idx, tmp3); + + insertedNodes[N] = tmp3; + tmp1 = tmp3; } // We will be constructing the following parts: // ... + // // /--* opN T - // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opO T - // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opN simd16 - // +--* opO simd16 - // tmp1 = * HWINTRINSIC simd16 T UnpackLow - // /--* opP T - // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opQ T - // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* opP simd16 - // +--* opQ simd16 - // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T MoveLowToHigh + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert // This is roughly the following managed code: // ... - // tmp1 = Sse.UnpackLow(opN, opO); - // tmp2 = Sse.UnpackLow(opP, opQ); - // return Sse.MoveLowToHigh(tmp1, tmp2); + // tmp2 = Vector128.CreateScalarUnsafe(opN); + // return Sse41.Insert(tmp1, tmp2, N << 4); - GenTree* op[4]; - op[0] = tmp1; + opN = node->Op(argCnt); - for (N = 1; N < argCnt; N++) - { - opN = node->Op(N + 1); + tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, opN, simdBaseJitType, 16); + LowerNode(tmp2); - op[N] = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, opN, simdBaseJitType, 16); - LowerNode(op[N]); - } + idx = comp->gtNewIconNode((argCnt - 1) << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); - tmp1 = - comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_X86Base_UnpackLow, simdBaseJitType, simdSize); - BlockRange().InsertAfter(LIR::LastNode(op[0], op[1]), tmp1); - LowerNode(tmp1); + node->ResetHWIntrinsicId(NI_X86Base_Insert, comp, tmp1, tmp2, idx); - tmp2 = - comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_X86Base_UnpackLow, simdBaseJitType, simdSize); - BlockRange().InsertAfter(LIR::LastNode(op[2], op[3]), tmp2); - LowerNode(tmp2); + for (N = 1; N < argCnt - 1; N++) + { + // LowerNode for NI_X86Base_Insert specially handles zeros, constants, and certain mask values + // to do the minimal number of operations and may merge together two neighboring inserts that + // don't have any side effects between them. Because of this and because of the interdependence + // of the inserts we've created above, we need to wait to lower the generated inserts until after + // we've completed the chain. - node->ResetHWIntrinsicId(NI_X86Base_MoveLowToHigh, tmp1, tmp2); + GenTree* insertedNode = insertedNodes[N]; + LowerNode(insertedNode); + } break; } @@ -5079,7 +4796,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { GenTree* op2 = node->Op(2); - if (varTypeIsLong(simdBaseType) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE42_X64)) + if (varTypeIsLong(simdBaseType) && comp->compOpportunisticallyDependsOn(InstructionSet_X86Base_X64)) { // We will be constructing the following parts: // ... 
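[Editor's note] The float Create path above now unconditionally builds an insertps chain, with the destination slot encoded in the upper two bits of the immediate (N << 4). A small C# sketch of that shape, under the assumption that Sse41.IsSupported holds (the names and the Main harness are illustrative, not part of the change):

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class InsertChainSketch
{
    // Build a Vector128<float> the way the lowering describes: start from element 0,
    // then insertps each remaining element into slot N via the (N << 4) immediate.
    static Vector128<float> CreateFloats(float e0, float e1, float e2, float e3)
    {
        Vector128<float> result = Vector128.CreateScalarUnsafe(e0);
        result = Sse41.Insert(result, Vector128.CreateScalarUnsafe(e1), 1 << 4);
        result = Sse41.Insert(result, Vector128.CreateScalarUnsafe(e2), 2 << 4);
        result = Sse41.Insert(result, Vector128.CreateScalarUnsafe(e3), 3 << 4);
        return result;
    }

    static void Main()
    {
        Console.WriteLine(CreateFloats(1f, 2f, 3f, 4f)); // expect <1, 2, 3, 4>
    }
}
```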
@@ -5096,7 +4813,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) idx = comp->gtNewIconNode(0x01, TYP_INT); BlockRange().InsertBefore(node, idx); - node->ResetHWIntrinsicId(NI_SSE42_X64_Insert, comp, tmp1, op2, idx); + node->ResetHWIntrinsicId(NI_X86Base_X64_Insert, comp, tmp1, op2, idx); break; } @@ -5526,7 +5243,7 @@ GenTree* Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) case TYP_LONG: case TYP_ULONG: { - resIntrinsic = NI_SSE42_X64_Extract; + resIntrinsic = NI_X86Base_X64_Extract; break; } @@ -5542,11 +5259,6 @@ GenTree* Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) case TYP_UBYTE: case TYP_INT: case TYP_UINT: - { - resIntrinsic = NI_SSE42_Extract; - break; - } - case TYP_SHORT: case TYP_USHORT: { @@ -5821,11 +5533,11 @@ GenTree* Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) case TYP_LONG: case TYP_ULONG: { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE42_X64)); + assert(comp->compIsaSupportedDebugOnly(InstructionSet_X86Base_X64)); idx = comp->gtNewIconNode(imm8); BlockRange().InsertBefore(result, idx); - result->ChangeHWIntrinsicId(NI_SSE42_X64_Insert, op1, op3, idx); + result->ChangeHWIntrinsicId(NI_X86Base_X64_Insert, op1, op3, idx); break; } @@ -5843,140 +5555,15 @@ GenTree* Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op3, CORINFO_TYPE_FLOAT, 16); LowerNode(tmp1); - if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - if (imm8 == 0) - { - // We will be constructing the following parts: - // ... - // /--* op1 simd16 - // +--* op2 simd16 - // node = * HWINTRINSIC simd16 T MoveScalar - - // This is roughly the following managed code: - // ... - // node = Sse.MoveScalar(op1, op2); - - result->ResetHWIntrinsicId(NI_X86Base_MoveScalar, op1, tmp1); - } - else - { - // We will be constructing the following parts: - // ... - // /--* op1 simd16 - // * STORE_LCL_VAR simd16 - // op2 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // idx = CNS_INT int 0 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // +--* idx int - // op1 = * HWINTRINSIC simd16 T Shuffle - // idx = CNS_INT int 226 - // /--* op1 simd16 - // +--* tmp2 simd16 - // +--* idx int - // op1 = * HWINTRINSIC simd16 T Shuffle - - // This is roughly the following managed code: - // ... - // tmp2 = Sse.Shuffle(tmp1, op1, 0 or 48 or 32); - // node = Sse.Shuffle(tmp2, op1, 226 or 132 or 36); - - result->Op(1) = op1; - LIR::Use op1Use(BlockRange(), &result->Op(1), result); - ReplaceWithLclVar(op1Use); - op2 = result->Op(1); - - tmp2 = comp->gtClone(op2); - BlockRange().InsertAfter(tmp1, tmp2); - - ssize_t controlBits1; - ssize_t controlBits2; - - // The comments beside the control bits below are listed using the managed API operands - // - // In practice, for the first step the value being inserted (op3) is in tmp1 - // while the other elements of the result (op1) are in tmp2. The result ends - // up containing the value being inserted and its immediate neighbor. - // - // The second step takes that result (which is in op1) plus the other elements - // from op2 (a clone of op1/tmp2 from the previous step) and combines them to - // create the final result. 
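[Editor's note] For 64-bit element types, the Create and WithElement paths above now go straight to the X64 Insert form (pinsrq with immediate 0x01) instead of an SSE2 fallback. A hedged C# sketch of that pattern; it assumes a 64-bit process where Sse41.X64.IsSupported is true, and the helper name is made up for illustration.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class LongWithElementSketch
{
    // Replace element 1 of a Vector128<long> with a single pinsrq (immediate 0x01),
    // preserving element 0, as in the lowering above.
    static Vector128<long> WithElement1(Vector128<long> vector, long value)
    {
        return Sse41.X64.Insert(vector, value, 0x01);
    }

    static void Main()
    {
        var v = Vector128.Create(10L, 20L);
        Console.WriteLine(WithElement1(v, 99L)); // expect <10, 99>
    }
}
```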
- - switch (imm8) - { - case 1: - { - controlBits1 = 0; // 00 00 00 00; op1 = { X = op3, Y = op3, Z = op1.X, W = op1.X } - controlBits2 = 226; // 11 10 00 10; node = { X = op1.X, Y = op3, Z = op1.Z, W = op1.W } - break; - } - - case 2: - { - controlBits1 = 15; // 00 00 11 11; op1 = { X = op1.W, Y = op1.W, Z = op3, W = op3 } - controlBits2 = 36; // 00 10 01 00; node = { X = op1.X, Y = op1.Y, Z = op3, W = op1.W } - break; - } - - case 3: - { - controlBits1 = 10; // 00 00 10 10; op1 = { X = op1.Z, Y = op1.Z, Z = op3, W = op3 } - controlBits2 = 132; // 10 00 01 00; node = { X = op1.X, Y = op1.Y, Z = op1.Z, W = op3 } - break; - } - - default: - unreached(); - } - - idx = comp->gtNewIconNode(controlBits1); - BlockRange().InsertAfter(tmp2, idx); - - if (imm8 != 1) - { - std::swap(tmp1, tmp2); - } - - op1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, idx, NI_X86Base_Shuffle, - CORINFO_TYPE_FLOAT, 16); - BlockRange().InsertAfter(idx, op1); - LowerNode(op1); - - idx = comp->gtNewIconNode(controlBits2); - BlockRange().InsertAfter(op1, idx); - - if (imm8 != 1) - { - std::swap(op1, op2); - } - - result->ChangeHWIntrinsicId(NI_X86Base_Shuffle, op1, op2, idx); - } - break; - } - else - { - imm8 = imm8 * 16; - op3 = tmp1; - FALLTHROUGH; - } + imm8 = imm8 * 16; + op3 = tmp1; + FALLTHROUGH; } case TYP_BYTE: case TYP_UBYTE: case TYP_INT: case TYP_UINT: - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE42)); - - idx = comp->gtNewIconNode(imm8); - BlockRange().InsertBefore(result, idx); - result->ChangeHWIntrinsicId(NI_SSE42_Insert, op1, op3, idx); - break; - } - case TYP_SHORT: case TYP_USHORT: { @@ -6179,137 +5766,105 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) { case TYP_SHORT: case TYP_USHORT: - { - horizontalAdd = NI_SSE42_HorizontalAdd; - - if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - shuffle = NI_X86Base_ShuffleLow; - } - break; - } - case TYP_INT: case TYP_UINT: { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE42)); - horizontalAdd = NI_SSE42_HorizontalAdd; + horizontalAdd = NI_X86Base_HorizontalAdd; break; } case TYP_FLOAT: { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // We will be constructing the following parts: - // idx = CNS_INT int 0xFF - // /--* op1 simd16 - // +--* op2 simd16 - // +--* idx int - // tmp3 = * HWINTRINSIC simd16 T DotProduct - // /--* tmp3 simd16 - // node = * HWINTRINSIC simd16 T ToScalar - - // This is roughly the following managed code: - // var tmp3 = Avx.DotProduct(op1, op2, 0xFF); - // return tmp3.ToScalar(); - - if (simdSize == 8) - { - idx = comp->gtNewIconNode(0x3F, TYP_INT); - } - else if (simdSize == 12) - { - idx = comp->gtNewIconNode(0x7F, TYP_INT); - } - else - { - assert(simdSize == 16); - idx = comp->gtNewIconNode(0xFF, TYP_INT); - } - BlockRange().InsertBefore(node, idx); - - if (varTypeIsSIMD(node->gtType)) - { - // We're producing a vector result, so just emit DotProduct directly - node->ResetHWIntrinsicId(NI_SSE42_DotProduct, comp, op1, op2, idx); - } - else - { - // We're producing a scalar result, so we only need the result in element 0 - // - // However, doing that would break/limit CSE and requires a partial write so - // it's better to just broadcast the value to the entire vector + // We will be constructing the following parts: + // idx = CNS_INT int 0xFF + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp3 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar - tmp3 = 
comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE42_DotProduct, - simdBaseJitType, simdSize); - BlockRange().InsertAfter(idx, tmp3); - LowerNode(tmp3); + // This is roughly the following managed code: + // var tmp3 = Avx.DotProduct(op1, op2, 0xFF); + // return tmp3.ToScalar(); - node->ResetHWIntrinsicId(NI_Vector128_ToScalar, tmp3); - } + if (simdSize == 8) + { + idx = comp->gtNewIconNode(0x3F, TYP_INT); + } + else if (simdSize == 12) + { + idx = comp->gtNewIconNode(0x7F, TYP_INT); + } + else + { + assert(simdSize == 16); + idx = comp->gtNewIconNode(0xFF, TYP_INT); + } + BlockRange().InsertBefore(node, idx); - return LowerNode(node); + if (varTypeIsSIMD(node->gtType)) + { + // We're producing a vector result, so just emit DotProduct directly + node->ResetHWIntrinsicId(NI_X86Base_DotProduct, comp, op1, op2, idx); } + else + { + // We're producing a scalar result, so we only need the result in element 0 + // + // However, doing that would break/limit CSE and requires a partial write so + // it's better to just broadcast the value to the entire vector - horizontalAdd = NI_SSE42_HorizontalAdd; + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_X86Base_DotProduct, + simdBaseJitType, simdSize); + BlockRange().InsertAfter(idx, tmp3); + LowerNode(tmp3); - if ((simdSize == 8) || !comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // We also do this for simdSize == 8 to ensure we broadcast the result as expected - shuffle = NI_X86Base_Shuffle; + node->ResetHWIntrinsicId(NI_Vector128_ToScalar, tmp3); } - break; + + return LowerNode(node); } case TYP_DOUBLE: { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // We will be constructing the following parts: - // idx = CNS_INT int 0x31 - // /--* op1 simd16 - // +--* op2 simd16 - // +--* idx int - // tmp3 = * HWINTRINSIC simd16 T DotProduct - // /--* tmp3 simd16 - // node = * HWINTRINSIC simd16 T ToScalar - - // This is roughly the following managed code: - // var tmp3 = Avx.DotProduct(op1, op2, 0x31); - // return tmp3.ToScalar(); + // We will be constructing the following parts: + // idx = CNS_INT int 0x31 + // /--* op1 simd16 + // +--* op2 simd16 + // +--* idx int + // tmp3 = * HWINTRINSIC simd16 T DotProduct + // /--* tmp3 simd16 + // node = * HWINTRINSIC simd16 T ToScalar - idx = comp->gtNewIconNode(0x33, TYP_INT); - BlockRange().InsertBefore(node, idx); + // This is roughly the following managed code: + // var tmp3 = Avx.DotProduct(op1, op2, 0x31); + // return tmp3.ToScalar(); - if (varTypeIsSIMD(node->gtType)) - { - // We're producing a vector result, so just emit DotProduct directly - node->ResetHWIntrinsicId(NI_SSE42_DotProduct, comp, op1, op2, idx); - } - else - { - // We're producing a scalar result, so we only need the result in element 0 - // - // However, doing that would break/limit CSE and requires a partial write so - // it's better to just broadcast the value to the entire vector + idx = comp->gtNewIconNode(0x33, TYP_INT); + BlockRange().InsertBefore(node, idx); - tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_SSE42_DotProduct, - simdBaseJitType, simdSize); - BlockRange().InsertAfter(idx, tmp3); - LowerNode(tmp3); + if (varTypeIsSIMD(node->gtType)) + { + // We're producing a vector result, so just emit DotProduct directly + node->ResetHWIntrinsicId(NI_X86Base_DotProduct, comp, op1, op2, idx); + } + else + { + // We're producing a scalar result, so we only need the result in element 0 + // + // However, doing that would break/limit CSE and requires a 
partial write so + // it's better to just broadcast the value to the entire vector - node->ResetHWIntrinsicId(NI_Vector128_ToScalar, tmp3); - } + tmp3 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, idx, NI_X86Base_DotProduct, + simdBaseJitType, simdSize); + BlockRange().InsertAfter(idx, tmp3); + LowerNode(tmp3); - return LowerNode(node); + node->ResetHWIntrinsicId(NI_Vector128_ToScalar, tmp3); } - horizontalAdd = NI_SSE42_HorizontalAdd; - - // We need to ensure we broadcast the result as expected - shuffle = NI_X86Base_Shuffle; - break; + return LowerNode(node); } default: @@ -7819,9 +7374,7 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) if (isContainable && varTypeIsIntegral(simdBaseType)) { - isContainable = (genTypeSize(simdBaseType) == genTypeSize(node)) && - (!varTypeIsSmall(simdBaseType) || - comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)); + isContainable = (genTypeSize(simdBaseType) == genTypeSize(node)); if (isContainable && varTypeIsSmall(simdBaseType)) { @@ -7841,7 +7394,7 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) LowerNode(op1); } - intrinsicId = varTypeIsByte(node) ? NI_SSE42_Extract : NI_X86Base_Extract; + intrinsicId = NI_X86Base_Extract; GenTree* zero = comp->gtNewZeroConNode(TYP_INT); BlockRange().InsertBefore(hwintrinsic, zero); @@ -7896,11 +7449,8 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) // However, we want to prefer containing the store over allowing the // input to be regOptional, so track and clear containment if required. - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - clearContainedNode = hwintrinsic->Op(1); - isContainable = !clearContainedNode->isContained(); - } + clearContainedNode = hwintrinsic->Op(1); + isContainable = !clearContainedNode->isContained(); } else { @@ -7913,8 +7463,7 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) } case NI_X86Base_Extract: - case NI_SSE42_Extract: - case NI_SSE42_X64_Extract: + case NI_X86Base_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: case NI_AVX512_ExtractVector128: @@ -7927,12 +7476,6 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) isContainable = HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && lastOp->IsCnsIntOrI() && (genTypeSize(simdBaseType) == genTypeSize(node)); - - if (isContainable && (intrinsicId == NI_X86Base_Extract)) - { - // Validate the pextrw encoding supports containment - isContainable = comp->compOpportunisticallyDependsOn(InstructionSet_SSE42); - } break; } @@ -8916,9 +8459,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre switch (parentIntrinsicId) { - case NI_SSE42_ConvertToVector128Int16: - case NI_SSE42_ConvertToVector128Int32: - case NI_SSE42_ConvertToVector128Int64: + case NI_X86Base_ConvertToVector128Int16: + case NI_X86Base_ConvertToVector128Int32: + case NI_X86Base_ConvertToVector128Int64: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: @@ -8986,8 +8529,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } case NI_X86Base_Insert: - case NI_SSE42_Insert: - case NI_SSE42_X64_Insert: + case NI_X86Base_X64_Insert: { // insertps op2 is xmm/m32. 
If xmm, the upper 2 bits of op3 (imm8) are used to select the element // position from the source vector; if m32, the source element selection bits in the imm8 are @@ -9156,7 +8698,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre // CRC32 codegen depends on its second operand's type. // Currently, we are using SIMDBaseType to store the op2Type info. - if (parentIntrinsicId == NI_SSE42_Crc32) + if (parentIntrinsicId == NI_X86Base_Crc32) { expectedSize = genTypeSize(parentBaseType); } @@ -9274,7 +8816,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return supportsSIMDScalarLoad; } - case NI_SSE42_MoveAndDuplicate: + case NI_X86Base_MoveAndDuplicate: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512_BroadcastScalarToVector512: @@ -9291,9 +8833,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } // make the broadcast node containable when embedded broadcast can be enabled. - if (intrinsicId == NI_SSE42_MoveAndDuplicate) + if (intrinsicId == NI_X86Base_MoveAndDuplicate) { - // NI_SSE42_MoveAndDuplicate is for Vector128 only. + // NI_X86Base_MoveAndDuplicate is for Vector128 only. assert(childBaseType == TYP_DOUBLE); } @@ -9326,7 +8868,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return false; } - case NI_SSE42_LoadAndDuplicateToVector128: + case NI_X86Base_LoadAndDuplicateToVector128: case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: { @@ -9612,16 +9154,16 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { switch (intrinsicId) { + case NI_X86Base_CeilingScalar: + case NI_X86Base_FloorScalar: case NI_X86Base_ReciprocalScalar: case NI_X86Base_ReciprocalSqrtScalar: + case NI_X86Base_RoundCurrentDirectionScalar: + case NI_X86Base_RoundToNearestIntegerScalar: + case NI_X86Base_RoundToNegativeInfinityScalar: + case NI_X86Base_RoundToPositiveInfinityScalar: + case NI_X86Base_RoundToZeroScalar: case NI_X86Base_SqrtScalar: - case NI_SSE42_CeilingScalar: - case NI_SSE42_FloorScalar: - case NI_SSE42_RoundCurrentDirectionScalar: - case NI_SSE42_RoundToNearestIntegerScalar: - case NI_SSE42_RoundToNegativeInfinityScalar: - case NI_SSE42_RoundToPositiveInfinityScalar: - case NI_SSE42_RoundToZeroScalar: case NI_AVX512_GetExponentScalar: case NI_AVX512_Reciprocal14Scalar: case NI_AVX512_ReciprocalSqrt14Scalar: @@ -9651,9 +9193,9 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_ConvertToVector128Int16: - case NI_SSE42_ConvertToVector128Int32: - case NI_SSE42_ConvertToVector128Int64: + case NI_X86Base_ConvertToVector128Int16: + case NI_X86Base_ConvertToVector128Int32: + case NI_X86Base_ConvertToVector128Int64: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: @@ -9967,6 +9509,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicId) { case NI_X86Base_Extract: + case NI_X86Base_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: case NI_AVX512_ExtractVector128: @@ -9992,6 +9535,15 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } case NI_X86Base_Shuffle: + { + if (varTypeIsByte(simdBaseType)) + { + TryMakeSrcContainedOrRegOptional(node, op2); + break; + } + FALLTHROUGH; + } + case NI_X86Base_ShuffleHigh: case NI_X86Base_ShuffleLow: case NI_AVX2_Permute4x64: @@ -10019,16 +9571,6 @@ void 
Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_Extract: - case NI_SSE42_X64_Extract: - { - // These intrinsics are "ins reg/mem, xmm" and get - // contained by the relevant store operation instead. - - assert(!varTypeIsFloating(simdBaseType)); - break; - } - case NI_AVX_Permute: case NI_X86Base_ShiftLeftLogical: case NI_X86Base_ShiftRightArithmetic: @@ -10490,7 +10032,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { switch (intrinsicId) { - case NI_SSE42_BlendVariable: + case NI_X86Base_BlendVariable: case NI_AVX_BlendVariable: case NI_AVX2_BlendVariable: { @@ -10641,13 +10183,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { switch (intrinsicId) { + case NI_X86Base_AlignRight: + case NI_X86Base_Blend: + case NI_X86Base_DotProduct: + case NI_X86Base_MultipleSumAbsoluteDifferences: case NI_X86Base_Shuffle: - case NI_X86Base_Insert: - case NI_SSE42_AlignRight: - case NI_SSE42_Blend: - case NI_SSE42_DotProduct: - case NI_SSE42_X64_Insert: - case NI_SSE42_MultipleSumAbsoluteDifferences: + case NI_X86Base_X64_Insert: case NI_AVX_Blend: case NI_AVX_Compare: case NI_AVX_CompareScalar: @@ -10698,7 +10239,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_Insert: + case NI_X86Base_Insert: { GenTree* lastOp = node->Op(numArgs); diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 4ee0389a57a470..11ac16713e3b10 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -4164,22 +4164,6 @@ int LinearScan::BuildStoreLoc(GenTreeLclVarCommon* storeLoc) { BuildUse(op1, RBM_NONE, i); } -#if defined(FEATURE_SIMD) && defined(TARGET_X86) - if (TargetOS::IsWindows && !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - if (varTypeIsSIMD(storeLoc) && op1->IsCall()) - { - // Need an additional register to create a SIMD8 from EAX/EDX without SSE4.1. - buildInternalFloatRegisterDefForNode(storeLoc, allSIMDRegs()); - - if (isCandidateVar(varDsc)) - { - // This internal register must be different from the target register. - setInternalRegsDelayFree = true; - } - } - } -#endif // FEATURE_SIMD && TARGET_X86 } else if (op1->isContained() && op1->OperIs(GT_BITCAST)) { diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index a947e5546e58c3..0174e6b3f7f9cf 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1710,13 +1710,6 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) { simdTemp = buildInternalFloatRegisterDefForNode(putArgStk); } - - if (!compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // To store SIMD12 without extractps we will need - // a temp xmm reg to do the shuffle. 
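[Editor's note] The ContainCheckHWIntrinsic change above lets the control-mask operand of a byte-element Shuffle (pshufb) be contained, so a constant or memory mask need not occupy a register. A C# sketch of the kind of code this affects; the reverse-mask example and names are illustrative, assuming Ssse3.IsSupported.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class ShuffleContainmentSketch
{
    // A byte shuffle whose control mask is a constant; with the containment change,
    // the mask operand of pshufb can be encoded as a memory operand (xmm, xmm/m128).
    static readonly Vector128<byte> ReverseMask = Vector128.Create(
        (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    static Vector128<byte> ReverseBytes(Vector128<byte> value)
    {
        return Ssse3.Shuffle(value, ReverseMask);
    }

    static void Main()
    {
        var input = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        Console.WriteLine(ReverseBytes(input)); // expect <15, 14, ..., 0>
    }
}
```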
- buildInternalFloatRegisterDefForNode(use.GetNode()); - } } #endif // defined(FEATURE_SIMD) @@ -2270,16 +2263,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou RefPosition* op1Use = BuildUse(op1); srcCount += 1; - - if ((baseType == TYP_FLOAT) && HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) && - !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - setDelayFree(op1Use); - } - else - { - tgtPrefUse = op1Use; - } + tgtPrefUse = op1Use; } buildUses = false; @@ -2289,12 +2273,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { dstCandidates = allByteRegs(); } - else if (varTypeIsLong(baseType) && !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // For SSE2 fallbacks, we will need a temp register to insert the upper half of a long - buildInternalFloatRegisterDefForNode(intrinsicTree); - setInternalRegsDelayFree = true; - } #endif // TARGET_X86 break; } @@ -2396,7 +2374,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } - case NI_SSE42_BlendVariable: + case NI_X86Base_BlendVariable: { assert(numArgs == 3); @@ -2424,7 +2402,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } - case NI_SSE42_Extract: + case NI_X86Base_Extract: { assert(!varTypeIsFloating(baseType)); @@ -2438,8 +2416,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } #ifdef TARGET_X86 - case NI_SSE42_Crc32: - case NI_SSE42_X64_Crc32: + case NI_X86Base_Crc32: + case NI_X86Base_X64_Crc32: { // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument // to the code generator. We may want to encode the overload info in another way. 
@@ -3107,15 +3085,6 @@ int LinearScan::BuildIndir(GenTreeIndir* indirTree) assert(!indirTree->TypeIs(TYP_STRUCT)); SingleTypeRegSet useCandidates = RBM_NONE; -#ifdef FEATURE_SIMD - if (indirTree->TypeIs(TYP_SIMD12) && indirTree->OperIs(GT_STOREIND) && - !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42) && !indirTree->Data()->IsVectorZero()) - { - // GT_STOREIND needs an internal register so the upper 4 bytes can be extracted - buildInternalFloatRegisterDefForNode(indirTree); - } -#endif // FEATURE_SIMD - #ifdef TARGET_AMD64 if (varTypeUsesIntReg(indirTree->Addr())) { diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 6857379c8b65bf..2ae7bef6c5c42a 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -421,14 +421,9 @@ void Rationalizer::RewriteHWIntrinsicAsUserCall(GenTree** use, ArrayStackcompOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // We want to keep this as is, because we'll rewrite it in post-order - return; - } - break; + return; } #endif // TARGET_XARCH @@ -698,7 +693,7 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac } else { - intrinsic = NI_SSE42_BlendVariable; + intrinsic = NI_X86Base_BlendVariable; } if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsic) && varTypeIsSmall(simdBaseType)) @@ -917,10 +912,6 @@ void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeS intrinsic = NI_AVX_CompareEqual; } } - else if (varTypeIsLong(simdBaseType)) - { - intrinsic = NI_SSE42_CompareEqual; - } else { intrinsic = NI_X86Base_CompareEqual; @@ -941,10 +932,6 @@ void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeS intrinsic = NI_AVX_CompareGreaterThan; } } - else if (varTypeIsLong(simdBaseType)) - { - intrinsic = NI_SSE42_CompareGreaterThan; - } else { intrinsic = NI_X86Base_CompareGreaterThan; @@ -978,10 +965,6 @@ void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeS intrinsic = NI_AVX_CompareLessThan; } } - else if (varTypeIsLong(simdBaseType)) - { - intrinsic = NI_SSE42_CompareLessThan; - } else { intrinsic = NI_X86Base_CompareLessThan; @@ -1539,9 +1522,6 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree parents.Push(castNode); } #elif defined(TARGET_XARCH) - NamedIntrinsic moveMaskIntrinsic = NI_Illegal; - NamedIntrinsic shuffleIntrinsic = NI_Illegal; - simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; // We want to tightly pack the most significant byte of each short/ushort @@ -1554,6 +1534,8 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree simdVal.u64[0] = 0x0F0D0B0907050301; simdVal.u64[1] = 0x8080808080808080; + NamedIntrinsic shuffleIntrinsic = NI_Illegal; + if (simdSize == 32) { // Vector256 works on 2x128-bit lanes, so repeat the same indices for the upper lane @@ -1561,15 +1543,11 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree simdVal.u64[2] = 0x0F0D0B0907050301; simdVal.u64[3] = 0x8080808080808080; - shuffleIntrinsic = NI_AVX2_Shuffle; - moveMaskIntrinsic = NI_X86Base_MoveMask; + shuffleIntrinsic = NI_AVX2_Shuffle; } else { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE42)); - - shuffleIntrinsic = NI_SSE42_Shuffle; - moveMaskIntrinsic = NI_X86Base_MoveMask; + shuffleIntrinsic = NI_X86Base_Shuffle; } GenTree* op2 = comp->gtNewVconNode(simdType); @@ -1606,7 +1584,7 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree simdSize = 16; } - node->ChangeHWIntrinsicId(moveMaskIntrinsic); + node->ChangeHWIntrinsicId(NI_X86Base_MoveMask); node->SetSimdSize(simdSize); node->SetSimdBaseJitType(simdBaseJitType); node->Op(1) = op1; diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index 9ed2774ae2f260..bb62ba71bdecb5 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -88,23 +88,12 @@ void CodeGen::genStoreIndTypeSimd12(GenTreeStoreInd* treeNode) // Store upper 4 bytes emit->emitInsStoreInd(INS_movss, EA_4BYTE, treeNode); } - else if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) + else { // Extract and store upper 4 bytes GenTreeStoreInd storeInd = storeIndirForm(TYP_SIMD16, addr, data); emit->emitIns_A_R_I(INS_extractps, EA_16BYTE, &storeInd, dataReg, 2); } - else - { - regNumber tmpReg = internalRegisters.GetSingle(treeNode); - - // Extract upper 4 bytes from data - emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg); - data->SetRegNum(tmpReg); - - // Store upper 4 bytes - emit->emitInsStoreInd(INS_movss, EA_4BYTE, treeNode); - } } //----------------------------------------------------------------------------- @@ -133,15 +122,11 @@ void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode) return; } - emitter* emit = GetEmitter(); - regNumber tgtReg = treeNode->GetRegNum(); - bool useSse42 = compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42); + emitter* emit = GetEmitter(); + regNumber tgtReg = treeNode->GetRegNum(); - if (useSse42) - { - // Load lower 8 bytes - emit->emitInsLoadInd(INS_movsd_simd, EA_8BYTE, tgtReg, treeNode); - } + // Load lower 8 bytes + emit->emitInsLoadInd(INS_movsd_simd, EA_8BYTE, tgtReg, treeNode); // Update the addr node to offset by 8 @@ -164,41 +149,9 @@ void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode) treeNode->Addr() = addr; - if (useSse42) - { - // Load and insert upper 4 bytes, 0x20 inserts to index 2 and 0x8 zeros index 3 - GenTreeIndir indir = indirForm(TYP_SIMD16, addr); - emit->emitIns_SIMD_R_R_A_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, &indir, 0x28, INS_OPTS_NONE); - } - else - { - // Load upper 4 bytes to lower half of tgtReg - emit->emitInsLoadInd(INS_movss, EA_4BYTE, tgtReg, treeNode); - - // Move upper 4 bytes to upper half of tgtReg - emit->emitIns_R_R(INS_movlhps, EA_16BYTE, tgtReg, tgtReg); - - // Revert the addr node to the original offset - // Doing it this way 
saves us a register and produces smaller code - - if (treeNode->isIndirAddrMode()) - { - GenTreeAddrMode* addrMode = addr->AsAddrMode(); - addrMode->SetOffset(addrMode->Offset() - 8); - } - else if (addr->IsCnsIntOrI() && addr->isContained()) - { - GenTreeIntConCommon* icon = addr->AsIntConCommon(); - icon->SetIconValue(icon->IconValue() - 8); - } - else - { - unreached(); - } - - // Load lower 8 bytes into tgtReg, preserving upper 4 bytes - emit->emitInsLoadInd(INS_movlps, EA_16BYTE, tgtReg, treeNode); - } + // Load and insert upper 4 bytes, 0x20 inserts to index 2 and 0x8 zeros index 3 + GenTreeIndir indir = indirForm(TYP_SIMD16, addr); + emit->emitIns_SIMD_R_R_A_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, &indir, 0x28, INS_OPTS_NONE); genProduceReg(treeNode); } @@ -288,21 +241,11 @@ void CodeGen::genEmitStoreLclTypeSimd12(GenTree* store, unsigned lclNum, unsigne // Store upper 4 bytes emit->emitIns_S_R(INS_movss, EA_4BYTE, dataReg, lclNum, offset + 8); } - else if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) + else { // Extract and store upper 4 bytes emit->emitIns_S_R_I(INS_extractps, EA_16BYTE, lclNum, offset + 8, dataReg, 2); } - else - { - regNumber tmpReg = internalRegisters.GetSingle(store); - - // Extract upper 4 bytes from data - emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg); - - // Store upper 4 bytes - emit->emitIns_S_R(INS_movss, EA_4BYTE, tmpReg, lclNum, offset + 8); - } } //------------------------------------------------------------------------ @@ -317,25 +260,11 @@ void CodeGen::genEmitLoadLclTypeSimd12(regNumber tgtReg, unsigned lclNum, unsign { emitter* emit = GetEmitter(); - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // Load lower 8 bytes into tgtReg, preserving upper 4 bytes - emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, lclNum, offset); - - // Load and insert upper 4 byte, 0x20 inserts to index 2 and 0x8 zeros index 3 - emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, lclNum, offset + 8, 0x28, INS_OPTS_NONE); - } - else - { - // Load upper 4 bytes to lower half of tgtReg - emit->emitIns_R_S(INS_movss, EA_4BYTE, tgtReg, lclNum, offset + 8); - - // Move upper 4 bytes to upper half of tgtReg - emit->emitIns_R_R(INS_movlhps, EA_16BYTE, tgtReg, tgtReg); + // Load lower 8 bytes into tgtReg, preserving upper 4 bytes + emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, lclNum, offset); - // Load lower 8 bytes into tgtReg, preserving upper 4 bytes - emit->emitIns_R_S(INS_movlps, EA_16BYTE, tgtReg, lclNum, offset); - } + // Load and insert upper 4 byte, 0x20 inserts to index 2 and 0x8 zeros index 3 + emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, lclNum, offset + 8, 0x28, INS_OPTS_NONE); } #ifdef TARGET_X86 @@ -524,26 +453,12 @@ void CodeGen::genSimd12UpperClear(regNumber tgtReg) { assert(genIsValidFloatReg(tgtReg)); - if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42)) - { - // ZMASK: 0b1000 - Preserve element 0, 1, and 2; Zero element 3 - // COUNT_D: 0b11 - Insert into element 3 - // COUNT_S: 0b11 - Insert from element 3 + // ZMASK: 0b1000 - Preserve element 0, 1, and 2; Zero element 3 + // COUNT_D: 0b11 - Insert into element 3 + // COUNT_S: 0b11 - Insert from element 3 - GetEmitter()->emitIns_SIMD_R_R_R_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, tgtReg, static_cast(0xF8), - INS_OPTS_NONE); - } - else - { - // Preserve element 0, 1, and 2; Zero element 3 - simd16_t constValue; - constValue.u32[0] = 0xFFFFFFFF; - constValue.u32[1] = 0xFFFFFFFF; - 
constValue.u32[2] = 0xFFFFFFFF; - constValue.u32[3] = 0x00000000; - CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue); - GetEmitter()->emitIns_SIMD_R_R_C(INS_andps, EA_16BYTE, tgtReg, tgtReg, zroSimd12Elm3, 0, INS_OPTS_NONE); - } + GetEmitter()->emitIns_SIMD_R_R_R_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, tgtReg, static_cast(0xF8), + INS_OPTS_NONE); } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 01f0d4f0509a42..370601501f1763 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -6702,8 +6702,8 @@ bool ValueNumStore::IsVNNeverNegative(ValueNum vn) case VNF_MDArrLowerBound: #ifdef FEATURE_HW_INTRINSICS #ifdef TARGET_XARCH - case VNF_HWI_SSE42_PopCount: - case VNF_HWI_SSE42_X64_PopCount: + case VNF_HWI_X86Base_PopCount: + case VNF_HWI_X86Base_X64_PopCount: case VNF_HWI_AVX2_LeadingZeroCount: case VNF_HWI_AVX2_TrailingZeroCount: case VNF_HWI_AVX2_X64_LeadingZeroCount: @@ -8084,7 +8084,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, return VNForLongCon(static_cast(result)); } - case NI_SSE42_PopCount: + case NI_X86Base_PopCount: { assert(!varTypeIsSmall(type) && !varTypeIsLong(type)); @@ -8094,7 +8094,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree, return VNForIntCon(static_cast(result)); } - case NI_SSE42_X64_PopCount: + case NI_X86Base_X64_PopCount: { assert(varTypeIsLong(type)); diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp index c80ae2069abe48..30b3c8254e4b78 100644 --- a/src/coreclr/nativeaot/Runtime/startup.cpp +++ b/src/coreclr/nativeaot/Runtime/startup.cpp @@ -176,13 +176,32 @@ static bool InitDLL(HANDLE hPalInstance) bool DetectCPUFeatures() { #if defined(HOST_X86) || defined(HOST_AMD64) || defined(HOST_ARM64) - g_cpuFeatures = minipal_getcpufeatures(); + int cpuFeatures = minipal_getcpufeatures(); - if ((g_cpuFeatures & g_requiredCpuFeatures) != g_requiredCpuFeatures) + if ((cpuFeatures & IntrinsicConstants_Invalid) != 0) { - PalPrintFatalError("\nThe required instruction sets are not supported by the current CPU.\n"); +#if defined(HOST_X86) || defined(HOST_AMD64) + PalPrintFatalError("\nThe current CPU is missing one or more of the following instruction sets: SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT\n"); +#elif defined(HOST_ARM64) && (defined(HOST_WINDOWS) || defined(HOST_APPLE)) + PalPrintFatalError("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd, LSE\n"); +#elif defined(HOST_ARM64) + PalPrintFatalError("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd\n"); +#else + PalPrintFatalError("\nThe current CPU is missing one or more of the baseline instruction sets.\n"); +#endif + RhFailFast(); } + + int missingCpuFeatures = g_requiredCpuFeatures & ~cpuFeatures; + + if (missingCpuFeatures != 0) + { + PalPrintFatalError("\nThe current CPU is missing one or more of the required instruction sets.\n"); + RhFailFast(); + } + + g_cpuFeatures = cpuFeatures; #endif // HOST_X86|| HOST_AMD64 || HOST_ARM64 return true; diff --git a/src/coreclr/tools/Common/Compiler/HardwareIntrinsicHelpers.cs b/src/coreclr/tools/Common/Compiler/HardwareIntrinsicHelpers.cs index 9c8da6eac983f3..5280bd5b0d1578 100644 --- a/src/coreclr/tools/Common/Compiler/HardwareIntrinsicHelpers.cs +++ b/src/coreclr/tools/Common/Compiler/HardwareIntrinsicHelpers.cs @@ -60,33 +60,28 @@ public static void 
AddRuntimeRequiredIsaFlagsToBuilder(InstructionSetSupportBuil // Keep these enumerations in sync with cpufeatures.h in the minipal. private static class XArchIntrinsicConstants { - // SSE and SSE2 are baseline ISAs - they're always available - public const int Sse42 = (1 << 0); - public const int Avx = (1 << 1); - public const int Avx2 = (1 << 2); - public const int Avx512 = (1 << 3); - - public const int Avx512v2 = (1 << 4); - public const int Avx512v3 = (1 << 5); - public const int Avx10v1 = (1 << 6); - public const int Avx10v2 = (1 << 7); - public const int Apx = (1 << 8); - - public const int Aes = (1 << 9); - public const int Avx512Vp2intersect = (1 << 10); - public const int AvxIfma = (1 << 11); - public const int AvxVnni = (1 << 12); + // SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and POPCNT are baseline ISAs - they're always available + public const int Avx = (1 << 0); + public const int Avx2 = (1 << 1); + public const int Avx512 = (1 << 2); + public const int Avx512v2 = (1 << 3); + public const int Avx512v3 = (1 << 4); + public const int Avx10v1 = (1 << 5); + public const int Avx10v2 = (1 << 6); + public const int Apx = (1 << 7); + public const int Aes = (1 << 8); + public const int Avx512Vp2intersect = (1 << 9); + public const int AvxIfma = (1 << 10); + public const int AvxVnni = (1 << 11); + public const int AvxVnniInt = (1 << 12); public const int Gfni = (1 << 13); public const int Sha = (1 << 14); public const int Vaes = (1 << 15); public const int WaitPkg = (1 << 16); public const int X86Serialize = (1 << 17); - public const int AvxVnniInt = (1 << 18); public static void AddToBuilder(InstructionSetSupportBuilder builder, int flags) { - if ((flags & Sse42) != 0) - builder.AddSupportedInstructionSet("sse42"); if ((flags & Avx) != 0) builder.AddSupportedInstructionSet("avx"); if ((flags & Avx2) != 0) @@ -144,15 +139,11 @@ public static void AddToBuilder(InstructionSetSupportBuilder builder, int flags) public static int FromInstructionSet(InstructionSet instructionSet) { Debug.Assert(InstructionSet.X64_AES == InstructionSet.X86_AES); - Debug.Assert(InstructionSet.X64_SSE42 == InstructionSet.X86_SSE42); Debug.Assert(InstructionSet.X64_AVX2 == InstructionSet.X86_AVX2); return instructionSet switch { // Optional ISAs - only available via opt-in or opportunistic light-up - InstructionSet.X64_SSE42 => Sse42, - InstructionSet.X64_SSE42_X64 => Sse42, - InstructionSet.X64_AVX => Avx, InstructionSet.X64_AVX_X64 => Avx, diff --git a/src/coreclr/tools/Common/InstructionSetHelpers.cs b/src/coreclr/tools/Common/InstructionSetHelpers.cs index 0fb2dd0f5c8c81..25a155eb4ca9f6 100644 --- a/src/coreclr/tools/Common/InstructionSetHelpers.cs +++ b/src/coreclr/tools/Common/InstructionSetHelpers.cs @@ -17,25 +17,63 @@ namespace System.CommandLine internal static partial class Helpers { public static InstructionSetSupport ConfigureInstructionSetSupport(string instructionSet, int maxVectorTBitWidth, bool isVectorTOptimistic, TargetArchitecture targetArchitecture, TargetOS targetOS, - string mustNotBeMessage, string invalidImplicationMessage, Logger logger, bool optimizingForSize = false) + string mustNotBeMessage, string invalidImplicationMessage, Logger logger, bool optimizingForSize, bool isReadyToRun) { InstructionSetSupportBuilder instructionSetSupportBuilder = new(targetArchitecture); - // Ready to run images are built with certain instruction set baselines + // Images are built with certain instruction set baselines + // + // For NativeAOT, this represents the minimum hardware required to run. 
+ // Older hardware will not work + // + // For ReadyToRun, this represents the presumed majority hardware. + // Older hardware (down to the NAOT baseline) will still work, but may have more jitting on startup + if ((targetArchitecture == TargetArchitecture.X86) || (targetArchitecture == TargetArchitecture.X64)) { - instructionSetSupportBuilder.AddSupportedInstructionSet("base"); + if (isReadyToRun && (targetOS != TargetOS.OSX)) + { + // ReadyToRun can presume AVX2, BMI1, BMI2, F16C, FMA, LZCNT, and MOVBE + instructionSetSupportBuilder.AddSupportedInstructionSet("x86-64-v3"); + } + else + { + // Otherwise, we require SSE4.2 and POPCNT + instructionSetSupportBuilder.AddSupportedInstructionSet("x86-64-v2"); + } } else if (targetArchitecture == TargetArchitecture.ARM64) { if (targetOS == TargetOS.OSX) { - // For osx-arm64 we know that apple-m1 is a baseline + // For osx-arm64 we know that apple-m1 is the baseline instructionSetSupportBuilder.AddSupportedInstructionSet("apple-m1"); } + else if (isReadyToRun) + { + if (targetOS == TargetOS.Windows) + { + // ReadyToRun on Windows can presume armv8.2-a and RCPC + instructionSetSupportBuilder.AddSupportedInstructionSet("armv8.2-a"); + instructionSetSupportBuilder.AddSupportedInstructionSet("rcpc"); + } + else + { + // While Unix needs a lower baseline due to things like Raspberry PI + instructionSetSupportBuilder.AddSupportedInstructionSet("armv8-a"); + instructionSetSupportBuilder.AddSupportedInstructionSet("lse"); + } + } else { - instructionSetSupportBuilder.AddSupportedInstructionSet("neon"); + // We require armv8-a everywhere + instructionSetSupportBuilder.AddSupportedInstructionSet("armv8-a"); + + if (targetOS == TargetOS.Windows) + { + // However, Windows also requires LSE + instructionSetSupportBuilder.AddSupportedInstructionSet("lse"); + } } } @@ -187,7 +225,6 @@ public static InstructionSetSupport ConfigureInstructionSetSupport(string instru // Note that we do not indicate support for AVX, or any other instruction set which uses the VEX encodings as // the presence of those makes otherwise acceptable code be unusable on hardware which does not support VEX encodings. 
// - optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sse42"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("aes"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("gfni"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha"); @@ -234,11 +271,13 @@ public static InstructionSetSupport ConfigureInstructionSetSupport(string instru { optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("aes"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("crc"); - optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha1"); - optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha2"); - optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("lse"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("dotprod"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("lse"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rcpc"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rcpc2"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rdma"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha1"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha2"); } // Vector can always be part of the optimistic set, we only want to optionally exclude it from the supported set diff --git a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs index ef3c8b4d7e6b65..7030699c50b89e 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs +++ b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs @@ -73,8 +73,6 @@ public static class ReadyToRunInstructionSetHelper { case InstructionSet.X64_X86Base: return ReadyToRunInstructionSet.X86Base; case InstructionSet.X64_X86Base_X64: return ReadyToRunInstructionSet.X86Base; - case InstructionSet.X64_SSE42: return ReadyToRunInstructionSet.Sse42; - case InstructionSet.X64_SSE42_X64: return ReadyToRunInstructionSet.Sse42; case InstructionSet.X64_AVX: return ReadyToRunInstructionSet.Avx; case InstructionSet.X64_AVX_X64: return ReadyToRunInstructionSet.Avx; case InstructionSet.X64_AVX2: return ReadyToRunInstructionSet.Avx2; @@ -129,8 +127,6 @@ public static class ReadyToRunInstructionSetHelper { case InstructionSet.X86_X86Base: return ReadyToRunInstructionSet.X86Base; case InstructionSet.X86_X86Base_X64: return null; - case InstructionSet.X86_SSE42: return ReadyToRunInstructionSet.Sse42; - case InstructionSet.X86_SSE42_X64: return null; case InstructionSet.X86_AVX: return ReadyToRunInstructionSet.Avx; case InstructionSet.X86_AVX_X64: return null; case InstructionSet.X86_AVX2: return ReadyToRunInstructionSet.Avx2; diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs index 0e3e406f989c8b..8a24639960115d 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs @@ -47,7 +47,6 @@ public enum InstructionSet RiscV64_Zba = InstructionSet_RiscV64.Zba, RiscV64_Zbb = InstructionSet_RiscV64.Zbb, X64_X86Base = InstructionSet_X64.X86Base, - X64_SSE42 = InstructionSet_X64.SSE42, X64_AVX = InstructionSet_X64.AVX, X64_AVX2 = InstructionSet_X64.AVX2, X64_AVX512 = InstructionSet_X64.AVX512, @@ -77,7 +76,6 @@ 
public enum InstructionSet X64_AVXVNNIINT = InstructionSet_X64.AVXVNNIINT, X64_AVXVNNIINT_V512 = InstructionSet_X64.AVXVNNIINT_V512, X64_X86Base_X64 = InstructionSet_X64.X86Base_X64, - X64_SSE42_X64 = InstructionSet_X64.SSE42_X64, X64_AVX_X64 = InstructionSet_X64.AVX_X64, X64_AVX2_X64 = InstructionSet_X64.AVX2_X64, X64_AVX512_X64 = InstructionSet_X64.AVX512_X64, @@ -94,7 +92,6 @@ public enum InstructionSet X64_WAITPKG_X64 = InstructionSet_X64.WAITPKG_X64, X64_X86Serialize_X64 = InstructionSet_X64.X86Serialize_X64, X86_X86Base = InstructionSet_X86.X86Base, - X86_SSE42 = InstructionSet_X86.SSE42, X86_AVX = InstructionSet_X86.AVX, X86_AVX2 = InstructionSet_X86.AVX2, X86_AVX512 = InstructionSet_X86.AVX512, @@ -124,7 +121,6 @@ public enum InstructionSet X86_AVXVNNIINT = InstructionSet_X86.AVXVNNIINT, X86_AVXVNNIINT_V512 = InstructionSet_X86.AVXVNNIINT_V512, X86_X86Base_X64 = InstructionSet_X86.X86Base_X64, - X86_SSE42_X64 = InstructionSet_X86.SSE42_X64, X86_AVX_X64 = InstructionSet_X86.AVX_X64, X86_AVX2_X64 = InstructionSet_X86.AVX2_X64, X86_AVX512_X64 = InstructionSet_X86.AVX512_X64, @@ -188,52 +184,50 @@ public enum InstructionSet_X64 ILLEGAL = InstructionSet.ILLEGAL, NONE = InstructionSet.NONE, X86Base = 1, - SSE42 = 2, - AVX = 3, - AVX2 = 4, - AVX512 = 5, - AVX512v2 = 6, - AVX512v3 = 7, - AVX10v1 = 8, - AVX10v2 = 9, - APX = 10, - AES = 11, - AES_V256 = 12, - AES_V512 = 13, - AVX512VP2INTERSECT = 14, - AVXIFMA = 15, - AVXVNNI = 16, - GFNI = 17, - GFNI_V256 = 18, - GFNI_V512 = 19, - SHA = 20, - WAITPKG = 21, - X86Serialize = 22, - Vector128 = 23, - Vector256 = 24, - Vector512 = 25, - VectorT128 = 26, - VectorT256 = 27, - VectorT512 = 28, - AVXVNNIINT = 29, - AVXVNNIINT_V512 = 30, - X86Base_X64 = 31, - SSE42_X64 = 32, - AVX_X64 = 33, - AVX2_X64 = 34, - AVX512_X64 = 35, - AVX512v2_X64 = 36, - AVX512v3_X64 = 37, - AVX10v1_X64 = 38, - AVX10v2_X64 = 39, - AES_X64 = 40, - AVX512VP2INTERSECT_X64 = 41, - AVXIFMA_X64 = 42, - AVXVNNI_X64 = 43, - GFNI_X64 = 44, - SHA_X64 = 45, - WAITPKG_X64 = 46, - X86Serialize_X64 = 47, + AVX = 2, + AVX2 = 3, + AVX512 = 4, + AVX512v2 = 5, + AVX512v3 = 6, + AVX10v1 = 7, + AVX10v2 = 8, + APX = 9, + AES = 10, + AES_V256 = 11, + AES_V512 = 12, + AVX512VP2INTERSECT = 13, + AVXIFMA = 14, + AVXVNNI = 15, + GFNI = 16, + GFNI_V256 = 17, + GFNI_V512 = 18, + SHA = 19, + WAITPKG = 20, + X86Serialize = 21, + Vector128 = 22, + Vector256 = 23, + Vector512 = 24, + VectorT128 = 25, + VectorT256 = 26, + VectorT512 = 27, + AVXVNNIINT = 28, + AVXVNNIINT_V512 = 29, + X86Base_X64 = 30, + AVX_X64 = 31, + AVX2_X64 = 32, + AVX512_X64 = 33, + AVX512v2_X64 = 34, + AVX512v3_X64 = 35, + AVX10v1_X64 = 36, + AVX10v2_X64 = 37, + AES_X64 = 38, + AVX512VP2INTERSECT_X64 = 39, + AVXIFMA_X64 = 40, + AVXVNNI_X64 = 41, + GFNI_X64 = 42, + SHA_X64 = 43, + WAITPKG_X64 = 44, + X86Serialize_X64 = 45, } public enum InstructionSet_X86 @@ -241,52 +235,50 @@ public enum InstructionSet_X86 ILLEGAL = InstructionSet.ILLEGAL, NONE = InstructionSet.NONE, X86Base = 1, - SSE42 = 2, - AVX = 3, - AVX2 = 4, - AVX512 = 5, - AVX512v2 = 6, - AVX512v3 = 7, - AVX10v1 = 8, - AVX10v2 = 9, - APX = 10, - AES = 11, - AES_V256 = 12, - AES_V512 = 13, - AVX512VP2INTERSECT = 14, - AVXIFMA = 15, - AVXVNNI = 16, - GFNI = 17, - GFNI_V256 = 18, - GFNI_V512 = 19, - SHA = 20, - WAITPKG = 21, - X86Serialize = 22, - Vector128 = 23, - Vector256 = 24, - Vector512 = 25, - VectorT128 = 26, - VectorT256 = 27, - VectorT512 = 28, - AVXVNNIINT = 29, - AVXVNNIINT_V512 = 30, - X86Base_X64 = 31, - SSE42_X64 = 32, - AVX_X64 = 33, - AVX2_X64 = 34, - AVX512_X64 
= 35, - AVX512v2_X64 = 36, - AVX512v3_X64 = 37, - AVX10v1_X64 = 38, - AVX10v2_X64 = 39, - AES_X64 = 40, - AVX512VP2INTERSECT_X64 = 41, - AVXIFMA_X64 = 42, - AVXVNNI_X64 = 43, - GFNI_X64 = 44, - SHA_X64 = 45, - WAITPKG_X64 = 46, - X86Serialize_X64 = 47, + AVX = 2, + AVX2 = 3, + AVX512 = 4, + AVX512v2 = 5, + AVX512v3 = 6, + AVX10v1 = 7, + AVX10v2 = 8, + APX = 9, + AES = 10, + AES_V256 = 11, + AES_V512 = 12, + AVX512VP2INTERSECT = 13, + AVXIFMA = 14, + AVXVNNI = 15, + GFNI = 16, + GFNI_V256 = 17, + GFNI_V512 = 18, + SHA = 19, + WAITPKG = 20, + X86Serialize = 21, + Vector128 = 22, + Vector256 = 23, + Vector512 = 24, + VectorT128 = 25, + VectorT256 = 26, + VectorT512 = 27, + AVXVNNIINT = 28, + AVXVNNIINT_V512 = 29, + X86Base_X64 = 30, + AVX_X64 = 31, + AVX2_X64 = 32, + AVX512_X64 = 33, + AVX512v2_X64 = 34, + AVX512v3_X64 = 35, + AVX10v1_X64 = 36, + AVX10v2_X64 = 37, + AES_X64 = 38, + AVX512VP2INTERSECT_X64 = 39, + AVXIFMA_X64 = 40, + AVXVNNI_X64 = 41, + GFNI_X64 = 42, + SHA_X64 = 43, + WAITPKG_X64 = 44, + X86Serialize_X64 = 45, } public unsafe struct InstructionSetFlags : IEnumerable @@ -525,10 +517,6 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target resultflags.AddInstructionSet(InstructionSet.X64_X86Base_X64); if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base_X64)) resultflags.AddInstructionSet(InstructionSet.X64_X86Base); - if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42)) - resultflags.AddInstructionSet(InstructionSet.X64_SSE42_X64); - if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42_X64)) - resultflags.AddInstructionSet(InstructionSet.X64_SSE42); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX)) resultflags.AddInstructionSet(InstructionSet.X64_AVX_X64); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX_X64)) @@ -589,10 +577,8 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target resultflags.AddInstructionSet(InstructionSet.X64_X86Serialize_X64); if (resultflags.HasInstructionSet(InstructionSet.X64_X86Serialize_X64)) resultflags.AddInstructionSet(InstructionSet.X64_X86Serialize); - if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42)) - resultflags.AddInstructionSet(InstructionSet.X64_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX)) - resultflags.AddInstructionSet(InstructionSet.X64_SSE42); + resultflags.AddInstructionSet(InstructionSet.X64_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2)) resultflags.AddInstructionSet(InstructionSet.X64_AVX); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX512)) @@ -622,7 +608,7 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target if (resultflags.HasInstructionSet(InstructionSet.X64_AVXVNNI)) resultflags.AddInstructionSet(InstructionSet.X64_AVX2); if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI)) - resultflags.AddInstructionSet(InstructionSet.X64_SSE42); + resultflags.AddInstructionSet(InstructionSet.X64_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI_V256)) resultflags.AddInstructionSet(InstructionSet.X64_GFNI); if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI_V256)) @@ -656,10 +642,8 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target break; case TargetArchitecture.X86: - if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42)) - resultflags.AddInstructionSet(InstructionSet.X86_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X86_AVX)) - 
resultflags.AddInstructionSet(InstructionSet.X86_SSE42); + resultflags.AddInstructionSet(InstructionSet.X86_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X86_AVX2)) resultflags.AddInstructionSet(InstructionSet.X86_AVX); if (resultflags.HasInstructionSet(InstructionSet.X86_AVX512)) @@ -689,7 +673,7 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target if (resultflags.HasInstructionSet(InstructionSet.X86_AVXVNNI)) resultflags.AddInstructionSet(InstructionSet.X86_AVX2); if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI)) - resultflags.AddInstructionSet(InstructionSet.X86_SSE42); + resultflags.AddInstructionSet(InstructionSet.X86_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI_V256)) resultflags.AddInstructionSet(InstructionSet.X86_GFNI); if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI_V256)) @@ -799,8 +783,6 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe case TargetArchitecture.X64: if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base_X64)) resultflags.AddInstructionSet(InstructionSet.X64_X86Base); - if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42_X64)) - resultflags.AddInstructionSet(InstructionSet.X64_SSE42); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX_X64)) resultflags.AddInstructionSet(InstructionSet.X64_AVX); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2_X64)) @@ -832,8 +814,6 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe if (resultflags.HasInstructionSet(InstructionSet.X64_X86Serialize_X64)) resultflags.AddInstructionSet(InstructionSet.X64_X86Serialize); if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base)) - resultflags.AddInstructionSet(InstructionSet.X64_SSE42); - if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42)) resultflags.AddInstructionSet(InstructionSet.X64_AVX); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX)) resultflags.AddInstructionSet(InstructionSet.X64_AVX2); @@ -863,7 +843,7 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe resultflags.AddInstructionSet(InstructionSet.X64_AVXIFMA); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2)) resultflags.AddInstructionSet(InstructionSet.X64_AVXVNNI); - if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42)) + if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base)) resultflags.AddInstructionSet(InstructionSet.X64_GFNI); if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI)) resultflags.AddInstructionSet(InstructionSet.X64_GFNI_V256); @@ -899,8 +879,6 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe case TargetArchitecture.X86: if (resultflags.HasInstructionSet(InstructionSet.X86_X86Base)) - resultflags.AddInstructionSet(InstructionSet.X86_SSE42); - if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42)) resultflags.AddInstructionSet(InstructionSet.X86_AVX); if (resultflags.HasInstructionSet(InstructionSet.X86_AVX)) resultflags.AddInstructionSet(InstructionSet.X86_AVX2); @@ -930,7 +908,7 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe resultflags.AddInstructionSet(InstructionSet.X86_AVXIFMA); if (resultflags.HasInstructionSet(InstructionSet.X86_AVX2)) resultflags.AddInstructionSet(InstructionSet.X86_AVXVNNI); - if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42)) + if (resultflags.HasInstructionSet(InstructionSet.X86_X86Base)) 
resultflags.AddInstructionSet(InstructionSet.X86_GFNI); if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI)) resultflags.AddInstructionSet(InstructionSet.X86_GFNI_V256); @@ -971,10 +949,8 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe private static Dictionary<(string, TargetArchitecture), string> AllInstructionSetGroups { get; } = new() { - { ("x86-64", TargetArchitecture.X64), "base" }, - { ("x86-64", TargetArchitecture.X86), "base" }, - { ("x86-64-v2", TargetArchitecture.X64), "x86-64 sse4.2" }, - { ("x86-64-v2", TargetArchitecture.X86), "x86-64 sse4.2" }, + { ("x86-64-v2", TargetArchitecture.X64), "base" }, + { ("x86-64-v2", TargetArchitecture.X86), "base" }, { ("x86-64-v3", TargetArchitecture.X64), "x86-64-v2 avx2" }, { ("x86-64-v3", TargetArchitecture.X86), "x86-64-v2 avx2" }, { ("x86-64-v4", TargetArchitecture.X64), "x86-64-v3 avx512" }, @@ -1046,11 +1022,11 @@ public static IEnumerable ArchitectureToValidInstructionSets yield return new InstructionSetInfo("base", "X86Base", InstructionSet.X64_X86Base, true); yield return new InstructionSetInfo("base", "Sse", InstructionSet.X64_X86Base, true); yield return new InstructionSetInfo("base", "Sse2", InstructionSet.X64_X86Base, true); - yield return new InstructionSetInfo("sse4.2", "Sse42", InstructionSet.X64_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Sse3", InstructionSet.X64_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Ssse3", InstructionSet.X64_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Sse41", InstructionSet.X64_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Popcnt", InstructionSet.X64_SSE42, true); + yield return new InstructionSetInfo("base", "Sse42", InstructionSet.X64_X86Base, true); + yield return new InstructionSetInfo("base", "Sse3", InstructionSet.X64_X86Base, true); + yield return new InstructionSetInfo("base", "Ssse3", InstructionSet.X64_X86Base, true); + yield return new InstructionSetInfo("base", "Sse41", InstructionSet.X64_X86Base, true); + yield return new InstructionSetInfo("base", "Popcnt", InstructionSet.X64_X86Base, true); yield return new InstructionSetInfo("avx", "Avx", InstructionSet.X64_AVX, true); yield return new InstructionSetInfo("avx2", "Avx2", InstructionSet.X64_AVX2, true); yield return new InstructionSetInfo("avx2", "Bmi1", InstructionSet.X64_AVX2, true); @@ -1119,11 +1095,11 @@ public static IEnumerable ArchitectureToValidInstructionSets yield return new InstructionSetInfo("base", "X86Base", InstructionSet.X86_X86Base, true); yield return new InstructionSetInfo("base", "Sse", InstructionSet.X86_X86Base, true); yield return new InstructionSetInfo("base", "Sse2", InstructionSet.X86_X86Base, true); - yield return new InstructionSetInfo("sse4.2", "Sse42", InstructionSet.X86_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Sse3", InstructionSet.X86_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Ssse3", InstructionSet.X86_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Sse41", InstructionSet.X86_SSE42, true); - yield return new InstructionSetInfo("sse4.2", "Popcnt", InstructionSet.X86_SSE42, true); + yield return new InstructionSetInfo("base", "Sse42", InstructionSet.X86_X86Base, true); + yield return new InstructionSetInfo("base", "Sse3", InstructionSet.X86_X86Base, true); + yield return new InstructionSetInfo("base", "Ssse3", InstructionSet.X86_X86Base, true); + yield return new InstructionSetInfo("base", "Sse41", InstructionSet.X86_X86Base, true); 
+ yield return new InstructionSetInfo("base", "Popcnt", InstructionSet.X86_X86Base, true); yield return new InstructionSetInfo("avx", "Avx", InstructionSet.X86_AVX, true); yield return new InstructionSetInfo("avx2", "Avx2", InstructionSet.X86_AVX2, true); yield return new InstructionSetInfo("avx2", "Bmi1", InstructionSet.X86_AVX2, true); @@ -1224,8 +1200,6 @@ public void Set64BitInstructionSetVariants(TargetArchitecture architecture) case TargetArchitecture.X64: if (HasInstructionSet(InstructionSet.X64_X86Base)) AddInstructionSet(InstructionSet.X64_X86Base_X64); - if (HasInstructionSet(InstructionSet.X64_SSE42)) - AddInstructionSet(InstructionSet.X64_SSE42_X64); if (HasInstructionSet(InstructionSet.X64_AVX)) AddInstructionSet(InstructionSet.X64_AVX_X64); if (HasInstructionSet(InstructionSet.X64_AVX2)) @@ -1286,7 +1260,6 @@ public void Set64BitInstructionSetVariantsUnconditionally(TargetArchitecture arc case TargetArchitecture.X64: AddInstructionSet(InstructionSet.X64_X86Base_X64); - AddInstructionSet(InstructionSet.X64_SSE42_X64); AddInstructionSet(InstructionSet.X64_AVX_X64); AddInstructionSet(InstructionSet.X64_AVX2_X64); AddInstructionSet(InstructionSet.X64_AVX512_X64); @@ -1306,7 +1279,6 @@ public void Set64BitInstructionSetVariantsUnconditionally(TargetArchitecture arc case TargetArchitecture.X86: AddInstructionSet(InstructionSet.X86_X86Base_X64); - AddInstructionSet(InstructionSet.X86_SSE42_X64); AddInstructionSet(InstructionSet.X86_AVX_X64); AddInstructionSet(InstructionSet.X86_AVX2_X64); AddInstructionSet(InstructionSet.X86_AVX512_X64); @@ -1479,33 +1451,33 @@ public static InstructionSet LookupPlatformIntrinsicInstructionSet(TargetArchite case "Sse42": if (nestedTypeName == "X64") - { return InstructionSet.X64_SSE42_X64; } + { return InstructionSet.X64_X86Base_X64; } else - { return InstructionSet.X64_SSE42; } + { return InstructionSet.X64_X86Base; } case "Sse3": if (nestedTypeName == "X64") - { return InstructionSet.X64_SSE42_X64; } + { return InstructionSet.X64_X86Base_X64; } else - { return InstructionSet.X64_SSE42; } + { return InstructionSet.X64_X86Base; } case "Ssse3": if (nestedTypeName == "X64") - { return InstructionSet.X64_SSE42_X64; } + { return InstructionSet.X64_X86Base_X64; } else - { return InstructionSet.X64_SSE42; } + { return InstructionSet.X64_X86Base; } case "Sse41": if (nestedTypeName == "X64") - { return InstructionSet.X64_SSE42_X64; } + { return InstructionSet.X64_X86Base_X64; } else - { return InstructionSet.X64_SSE42; } + { return InstructionSet.X64_X86Base; } case "Popcnt": if (nestedTypeName == "X64") - { return InstructionSet.X64_SSE42_X64; } + { return InstructionSet.X64_X86Base_X64; } else - { return InstructionSet.X64_SSE42; } + { return InstructionSet.X64_X86Base; } case "Avx": if (nestedTypeName == "X64") @@ -1800,19 +1772,19 @@ public static InstructionSet LookupPlatformIntrinsicInstructionSet(TargetArchite { return InstructionSet.X86_X86Base; } case "Sse42": - { return InstructionSet.X86_SSE42; } + { return InstructionSet.X86_X86Base; } case "Sse3": - { return InstructionSet.X86_SSE42; } + { return InstructionSet.X86_X86Base; } case "Ssse3": - { return InstructionSet.X86_SSE42; } + { return InstructionSet.X86_X86Base; } case "Sse41": - { return InstructionSet.X86_SSE42; } + { return InstructionSet.X86_X86Base; } case "Popcnt": - { return InstructionSet.X86_SSE42; } + { return InstructionSet.X86_X86Base; } case "Avx": { return InstructionSet.X86_AVX; } diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt 
b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt index 24c3f474ab14de..054e5ff1d3e35a 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt @@ -32,11 +32,11 @@ instructionset ,X86 ,X86Base , ,22 ,X86Base instructionset ,X86 ,Sse , ,1 ,X86Base ,base instructionset ,X86 ,Sse2 , ,2 ,X86Base ,base -instructionset ,X86 ,Sse42 , ,6 ,SSE42 ,sse4.2 -instructionset ,X86 ,Sse3 , ,3 ,SSE42 ,sse4.2 -instructionset ,X86 ,Ssse3 , ,4 ,SSE42 ,sse4.2 -instructionset ,X86 ,Sse41 , ,5 ,SSE42 ,sse4.2 -instructionset ,X86 ,Popcnt , ,15 ,SSE42 ,sse4.2 +instructionset ,X86 ,Sse42 , ,6 ,X86Base ,base +instructionset ,X86 ,Sse3 , ,3 ,X86Base ,base +instructionset ,X86 ,Ssse3 , ,4 ,X86Base ,base +instructionset ,X86 ,Sse41 , ,5 ,X86Base ,base +instructionset ,X86 ,Popcnt , ,15 ,X86Base ,base instructionset ,X86 ,Avx , ,7 ,AVX ,avx @@ -122,7 +122,6 @@ instructionset ,X86 ,AvxVnniInt16_V512 , ,63 ,AVXVNNI ; 64-bit Instruction Sets instructionset64bit,X86 ,X86Base -instructionset64bit,X86 ,SSE42 instructionset64bit,X86 ,AVX instructionset64bit,X86 ,AVX2 @@ -153,9 +152,7 @@ vectorinstructionset,X86 ,Vector512 ; Implications -implication ,X86 ,SSE42 ,X86Base - -implication ,X86 ,AVX ,SSE42 +implication ,X86 ,AVX ,X86Base implication ,X86 ,AVX2 ,AVX implication ,X86 ,AVX512 ,AVX2 @@ -175,7 +172,7 @@ implication ,X86 ,AVX512VP2INTERSECT ,AVX512 implication ,X86 ,AVXIFMA ,AVX2 implication ,X86 ,AVXVNNI ,AVX2 -implication ,X86 ,GFNI ,SSE42 +implication ,X86 ,GFNI ,X86Base implication ,X86 ,GFNI_V256 ,GFNI implication ,X86 ,GFNI_V256 ,AVX implication ,X86 ,GFNI_V512 ,GFNI_V256 @@ -264,8 +261,7 @@ implication ,RiscV64 ,Zbb ,RiscV64Base implication ,RiscV64 ,Zba ,RiscV64Base ; ,name and aliases ,archs ,lower baselines included by implication -instructionsetgroup ,x86-64 ,X64 X86 ,base -instructionsetgroup ,x86-64-v2 ,X64 X86 ,x86-64 sse4.2 +instructionsetgroup ,x86-64-v2 ,X64 X86 ,base instructionsetgroup ,x86-64-v3 ,X64 X86 ,x86-64-v2 avx2 instructionsetgroup ,x86-64-v4 ,X64 X86 ,x86-64-v3 avx512 diff --git a/src/coreclr/tools/aot/ILCompiler/Program.cs b/src/coreclr/tools/aot/ILCompiler/Program.cs index 04795875d6d071..003d016d9658ce 100644 --- a/src/coreclr/tools/aot/ILCompiler/Program.cs +++ b/src/coreclr/tools/aot/ILCompiler/Program.cs @@ -108,7 +108,8 @@ public int Run() TargetOS targetOS = Get(_command.TargetOS); InstructionSetSupport instructionSetSupport = Helpers.ConfigureInstructionSetSupport(Get(_command.InstructionSet), Get(_command.MaxVectorTBitWidth), isVectorTOptimistic, targetArchitecture, targetOS, "Unrecognized instruction set {0}", "Unsupported combination of instruction sets: {0}/{1}", logger, - optimizingForSize: _command.OptimizationMode == OptimizationMode.PreferSize); + optimizingForSize: _command.OptimizationMode == OptimizationMode.PreferSize, + isReadyToRun: false); string systemModuleName = Get(_command.SystemModuleName); string reflectionData = Get(_command.ReflectionData); diff --git a/src/coreclr/tools/aot/crossgen2/Program.cs b/src/coreclr/tools/aot/crossgen2/Program.cs index 53404bb9feb440..a8afdcb1e98916 100644 --- a/src/coreclr/tools/aot/crossgen2/Program.cs +++ b/src/coreclr/tools/aot/crossgen2/Program.cs @@ -86,7 +86,9 @@ public int Run() TargetArchitecture targetArchitecture = Get(_command.TargetArchitecture); TargetOS targetOS = Get(_command.TargetOS); InstructionSetSupport instructionSetSupport = 
Helpers.ConfigureInstructionSetSupport(Get(_command.InstructionSet), Get(_command.MaxVectorTBitWidth), isVectorTOptimistic, targetArchitecture, targetOS, - SR.InstructionSetMustNotBe, SR.InstructionSetInvalidImplication, logger); + SR.InstructionSetMustNotBe, SR.InstructionSetInvalidImplication, logger, + optimizingForSize: _command.OptimizationMode == OptimizationMode.PreferSize, + isReadyToRun: true); SharedGenericsMode genericsMode = SharedGenericsMode.CanonicalReferenceTypes; var targetDetails = new TargetDetails(targetArchitecture, targetOS, Crossgen2RootCommand.IsArmel ? TargetAbi.NativeAotArmel : TargetAbi.NativeAot, instructionSetSupport.GetVectorTSimdVector()); diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp index 03aaed2aa9515f..5dce2f6669ca65 100644 --- a/src/coreclr/vm/codeman.cpp +++ b/src/coreclr/vm/codeman.cpp @@ -1180,6 +1180,19 @@ void EEJitManager::SetCpuInfo() int cpuFeatures = minipal_getcpufeatures(); + if ((cpuFeatures & IntrinsicConstants_Invalid) != 0) + { +#if defined(TARGET_X86) || defined(TARGET_AMD64) + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the following instruction sets: SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT\n")); +#elif defined(TARGET_ARM64) && (defined(TARGET_WINDOWS) || defined(TARGET_APPLE)) + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd, LSE\n")); +#elif defined(TARGET_ARM64) + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd\n")); +#else + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the baseline instruction sets.\n")); +#endif + } + // Get the maximum bitwidth of Vector, rounding down to the nearest multiple of 128-bits uint32_t maxVectorTBitWidth = (CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_MaxVectorTBitWidth) / 128) * 128; @@ -1198,20 +1211,13 @@ void EEJitManager::SetCpuInfo() CPUCompileFlags.Set(InstructionSet_VectorT512); } - // x86-64-v1 + // x86-64-v2 if (CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableHWIntrinsic)) { CPUCompileFlags.Set(InstructionSet_X86Base); } - // x86-64-v2 - - if (((cpuFeatures & XArchIntrinsicConstants_Sse42) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE42)) - { - CPUCompileFlags.Set(InstructionSet_SSE42); - } - // x86-64-v3 if (((cpuFeatures & XArchIntrinsicConstants_Avx) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX)) @@ -6317,7 +6323,7 @@ size_t ReadyToRunJitManager::WalkILOffsets( BoundsType boundsType, void* pContext, size_t (* pfnWalkILOffsets)(ICorDebugInfo::OffsetMapping *pOffsetMapping, void *pContext)) -{ +{ CONTRACTL { THROWS; // on OOM. 
GC_NOTRIGGER; // getting vars shouldn't trigger diff --git a/src/native/minipal/cpufeatures.c b/src/native/minipal/cpufeatures.c index 9122f8411c9b52..b49336a8843655 100644 --- a/src/native/minipal/cpufeatures.c +++ b/src/native/minipal/cpufeatures.c @@ -238,8 +238,17 @@ int minipal_getcpufeatures(void) bool hasAvx2Dependencies = false; bool hasAvx10v1Dependencies = false; - assert((cpuidInfo[CPUID_EDX] & (1 << 25)) != 0); // SSE - assert((cpuidInfo[CPUID_EDX] & (1 << 26)) != 0); // SSE2 + if (((cpuidInfo[CPUID_EDX] & (1 << 25)) == 0) || // SSE + ((cpuidInfo[CPUID_EDX] & (1 << 26)) == 0) || // SSE2 + ((cpuidInfo[CPUID_ECX] & (1 << 0)) == 0) || // SSE3 + ((cpuidInfo[CPUID_ECX] & (1 << 9)) == 0) || // SSSE3 + ((cpuidInfo[CPUID_ECX] & (1 << 19)) == 0) || // SSE4.1 + ((cpuidInfo[CPUID_ECX] & (1 << 20)) == 0) || // SSE4.2 + ((cpuidInfo[CPUID_ECX] & (1 << 23)) == 0)) // POPCNT + { + // One of the baseline ISAs is not supported + result |= IntrinsicConstants_Invalid; + } if (((cpuidInfo[CPUID_ECX] & (1 << 25)) != 0) && // AESNI ((cpuidInfo[CPUID_ECX] & (1 << 1)) != 0)) // PCLMULQDQ @@ -247,27 +256,18 @@ int minipal_getcpufeatures(void) result |= XArchIntrinsicConstants_Aes; } - if (((cpuidInfo[CPUID_ECX] & (1 << 0)) != 0) && // SSE3 - ((cpuidInfo[CPUID_ECX] & (1 << 9)) != 0) && // SSSE3 - ((cpuidInfo[CPUID_ECX] & (1 << 19)) != 0) && // SSE4.1 - ((cpuidInfo[CPUID_ECX] & (1 << 20)) != 0) && // SSE4.2 - ((cpuidInfo[CPUID_ECX] & (1 << 23)) != 0)) // POPCNT + if (((cpuidInfo[CPUID_ECX] & (1 << 27)) != 0) && // OSXSAVE + ((cpuidInfo[CPUID_ECX] & (1 << 28)) != 0)) // AVX { - result |= XArchIntrinsicConstants_Sse42; - - if (((cpuidInfo[CPUID_ECX] & (1 << 27)) != 0) && // OSXSAVE - ((cpuidInfo[CPUID_ECX] & (1 << 28)) != 0)) // AVX + if (IsAvxEnabled() && (xmmYmmStateSupport() == 1)) // XGETBV == 11 { - if (IsAvxEnabled() && (xmmYmmStateSupport() == 1)) // XGETBV == 11 - { - result |= XArchIntrinsicConstants_Avx; + result |= XArchIntrinsicConstants_Avx; - if (((cpuidInfo[CPUID_ECX] & (1 << 29)) != 0) && // F16C - ((cpuidInfo[CPUID_ECX] & (1 << 12)) != 0) && // FMA - ((cpuidInfo[CPUID_ECX] & (1 << 22)) != 0)) // MOVBE - { - hasAvx2Dependencies = true; - } + if (((cpuidInfo[CPUID_ECX] & (1 << 29)) != 0) && // F16C + ((cpuidInfo[CPUID_ECX] & (1 << 12)) != 0) && // FMA + ((cpuidInfo[CPUID_ECX] & (1 << 22)) != 0)) // MOVBE + { + hasAvx2Dependencies = true; } } } @@ -455,14 +455,18 @@ int minipal_getcpufeatures(void) #if HAVE_AUXV_HWCAP_H unsigned long hwCap = getauxval(AT_HWCAP); - assert(hwCap & HWCAP_ASIMD); + if ((hwCap & HWCAP_ASIMD) == 0) + { + // One of the baseline ISAs is not supported + result |= IntrinsicConstants_Invalid; + } + + if ((hwCap & HWCAP_ATOMICS) != 0) + result |= ARM64IntrinsicConstants_Atomics; if (hwCap & HWCAP_AES) result |= ARM64IntrinsicConstants_Aes; - if (hwCap & HWCAP_ATOMICS) - result |= ARM64IntrinsicConstants_Atomics; - if (hwCap & HWCAP_CRC32) result |= ARM64IntrinsicConstants_Crc32; @@ -498,6 +502,17 @@ int minipal_getcpufeatures(void) int64_t valueFromSysctl = 0; size_t sz = sizeof(valueFromSysctl); + if ((sysctlbyname("hw.optional.AdvSIMD", &valueFromSysctl, &sz, NULL, 0) != 0) || (valueFromSysctl == 0) || + (sysctlbyname("hw.optional.arm.FEAT_LSE", &valueFromSysctl, &sz, NULL, 0) != 0) || (valueFromSysctl == 0)) + { + // One of the baseline ISAs is not supported + result |= IntrinsicConstants_Invalid; + } + else + { + result |= ARM64IntrinsicConstants_Atomics; + } + if ((sysctlbyname("hw.optional.arm.FEAT_AES", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 
0)) result |= ARM64IntrinsicConstants_Aes; @@ -516,9 +531,6 @@ int minipal_getcpufeatures(void) if ((sysctlbyname("hw.optional.arm.FEAT_SHA256", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0)) result |= ARM64IntrinsicConstants_Sha256; - if ((sysctlbyname("hw.optional.armv8_1_atomics", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0)) - result |= ARM64IntrinsicConstants_Atomics; - if ((sysctlbyname("hw.optional.arm.FEAT_LRCPC", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0)) result |= ARM64IntrinsicConstants_Rcpc; @@ -529,6 +541,17 @@ int minipal_getcpufeatures(void) #endif // HOST_UNIX #if defined(HOST_WINDOWS) + if (!IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) || + !IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE)) + { + // One of the baseline ISAs is not supported + result |= IntrinsicConstants_Invalid; + } + else + { + result |= ARM64IntrinsicConstants_Atomics; + } + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) { result |= ARM64IntrinsicConstants_Aes; @@ -541,11 +564,6 @@ int minipal_getcpufeatures(void) result |= ARM64IntrinsicConstants_Crc32; } - if (IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE)) - { - result |= ARM64IntrinsicConstants_Atomics; - } - if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { result |= ARM64IntrinsicConstants_Dp; @@ -578,7 +596,6 @@ int minipal_getcpufeatures(void) { result |= ARM64IntrinsicConstants_Sve2; } - #endif // HOST_WINDOWS #endif // HOST_ARM64 diff --git a/src/native/minipal/cpufeatures.h b/src/native/minipal/cpufeatures.h index 92284d18899d7e..f751985d540af7 100644 --- a/src/native/minipal/cpufeatures.h +++ b/src/native/minipal/cpufeatures.h @@ -8,28 +8,28 @@ // Should match the constants defined in the compiler in HardwareIntrinsicHelpers.cs // -#if defined(HOST_X86) || defined(HOST_AMD64) -#define XArchIntrinsicConstants_Sse42 (1 << 0) -#define XArchIntrinsicConstants_Avx (1 << 1) -#define XArchIntrinsicConstants_Avx2 (1 << 2) -#define XArchIntrinsicConstants_Avx512 (1 << 3) - -#define XArchIntrinsicConstants_Avx512v2 (1 << 4) -#define XArchIntrinsicConstants_Avx512v3 (1 << 5) -#define XArchIntrinsicConstants_Avx10v1 (1 << 6) -#define XArchIntrinsicConstants_Avx10v2 (1 << 7) -#define XArchIntrinsicConstants_Apx (1 << 8) +// Reserve the last bit to indicate an invalid query, such as if a baseline ISA isn't supported +#define IntrinsicConstants_Invalid (1 << 31) -#define XArchIntrinsicConstants_Aes (1 << 9) -#define XArchIntrinsicConstants_Avx512Vp2intersect (1 << 10) -#define XArchIntrinsicConstants_AvxIfma (1 << 11) -#define XArchIntrinsicConstants_AvxVnni (1 << 12) +#if defined(HOST_X86) || defined(HOST_AMD64) +#define XArchIntrinsicConstants_Avx (1 << 0) +#define XArchIntrinsicConstants_Avx2 (1 << 1) +#define XArchIntrinsicConstants_Avx512 (1 << 2) +#define XArchIntrinsicConstants_Avx512v2 (1 << 3) +#define XArchIntrinsicConstants_Avx512v3 (1 << 4) +#define XArchIntrinsicConstants_Avx10v1 (1 << 5) +#define XArchIntrinsicConstants_Avx10v2 (1 << 6) +#define XArchIntrinsicConstants_Apx (1 << 7) +#define XArchIntrinsicConstants_Aes (1 << 8) +#define XArchIntrinsicConstants_Avx512Vp2intersect (1 << 9) +#define XArchIntrinsicConstants_AvxIfma (1 << 10) +#define XArchIntrinsicConstants_AvxVnni (1 << 11) +#define XArchIntrinsicConstants_AvxVnniInt (1 << 12) #define XArchIntrinsicConstants_Gfni (1 << 13) #define XArchIntrinsicConstants_Sha (1 << 14) #define XArchIntrinsicConstants_Vaes (1 << 15) #define 
XArchIntrinsicConstants_WaitPkg (1 << 16) #define XArchIntrinsicConstants_X86Serialize (1 << 17) -#define XArchIntrinsicConstants_AvxVnniInt (1 << 18) #endif // HOST_X86 || HOST_AMD64 #if defined(HOST_ARM64) @@ -50,7 +50,6 @@ // Bit position for the ARM64IntrinsicConstants_Atomics flags, to be used with tbz / tbnz instructions #define ARM64_ATOMICS_FEATURE_FLAG_BIT 6 static_assert((1 << ARM64_ATOMICS_FEATURE_FLAG_BIT) == ARM64IntrinsicConstants_Atomics, "ARM64_ATOMICS_FEATURE_FLAG_BIT must match with ARM64IntrinsicConstants_Atomics"); - #endif // HOST_ARM64 #if defined(HOST_RISCV64) diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index 31c7b436dd7be8..d209ae79a54b01 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -22,7 +22,6 @@ DOTNET_EnableAVX2; DOTNET_EnableAVX512; DOTNET_EnableHWIntrinsic; - DOTNET_EnableSSE42; DOTNET_EnableAPX; DOTNET_JitStressEvexEncoding; DOTNET_PreferredVectorBitWidth; @@ -103,26 +102,23 @@ - + - - + - - + - diff --git a/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs b/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs index d97aa84f99c57a..d2023b8a6911a0 100644 --- a/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs +++ b/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs @@ -70,7 +70,7 @@ public unsafe static void CpuId() for (int i = 0; i < 2; i++) { - // SSE, SSE2 are paired + // SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT are paired if (IsBitIncorrect(edx, 25, typeof(Sse), Sse.IsSupported, "HWIntrinsic", ref isHierarchyDisabled)) { @@ -81,59 +81,50 @@ public unsafe static void CpuId() { testResult = Fail; } - } - - bool isBaselineHierarchyDisabled = isHierarchyDisabled; - - for (int i = 0; i < 2; i++) - { - // AES, PCLMULQDQ are paired - if (IsBitIncorrect(ecx, 25, typeof(Aes), Aes.IsSupported, "AES", ref isHierarchyDisabled)) + if (IsBitIncorrect(ecx, 0, typeof(Sse3), Sse3.IsSupported, "HWIntrinsic", ref isHierarchyDisabled)) { testResult = Fail; } - if (IsBitIncorrect(ecx, 1, typeof(Pclmulqdq), Pclmulqdq.IsSupported, "AES", ref isHierarchyDisabled)) + if (IsBitIncorrect(ecx, 9, typeof(Ssse3), Ssse3.IsSupported, "HWIntrinsic", ref isHierarchyDisabled)) { testResult = Fail; } - } - - isHierarchyDisabled = isBaselineHierarchyDisabled; - - for (int i = 0; i < 2; i++) - { - // SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT are paired - if (IsBitIncorrect(ecx, 0, typeof(Sse3), Sse3.IsSupported, "SSE42", ref isHierarchyDisabled)) + if (IsBitIncorrect(ecx, 19, typeof(Sse41), Sse41.IsSupported, "HWIntrinsic", ref isHierarchyDisabled)) { testResult = Fail; } - if (IsBitIncorrect(ecx, 9, typeof(Ssse3), Ssse3.IsSupported, "SSE42", ref isHierarchyDisabled)) + if (IsBitIncorrect(ecx, 20, typeof(Sse42), Sse42.IsSupported, "HWIntrinsic", ref isHierarchyDisabled)) { testResult = Fail; } - if (IsBitIncorrect(ecx, 19, typeof(Sse41), Sse41.IsSupported, "SSE42", ref isHierarchyDisabled)) + if (IsBitIncorrect(ecx, 23, typeof(Popcnt), Popcnt.IsSupported, "HWIntrinsic", ref isHierarchyDisabled)) { testResult = Fail; } + } - if (IsBitIncorrect(ecx, 20, typeof(Sse42), Sse42.IsSupported, "SSE42", ref isHierarchyDisabled)) + bool isBaselineHierarchyDisabled = isHierarchyDisabled; + + for (int i = 0; i < 2; i++) + { + // AES, PCLMULQDQ are paired + + if (IsBitIncorrect(ecx, 25, typeof(Aes), Aes.IsSupported, "AES", ref isHierarchyDisabled)) { testResult = Fail; } - if (IsBitIncorrect(ecx, 23, typeof(Popcnt), Popcnt.IsSupported, "SSE42", ref isHierarchyDisabled)) + if (IsBitIncorrect(ecx, 
1, typeof(Pclmulqdq), Pclmulqdq.IsSupported, "AES", ref isHierarchyDisabled)) { testResult = Fail; } } - bool isSse42HierarchyDisabled = isHierarchyDisabled; - if (IsBitIncorrect(ecx, 28, typeof(Avx), Avx.IsSupported, "AVX", ref isHierarchyDisabled)) { testResult = Fail; @@ -280,7 +271,7 @@ public unsafe static void CpuId() testResult = Fail; } - isHierarchyDisabled = isSse42HierarchyDisabled; + isHierarchyDisabled = isBaselineHierarchyDisabled; if (IsBitIncorrect(ecx, 8, typeof(Gfni), Gfni.IsSupported, "GFNI", ref isHierarchyDisabled)) { diff --git a/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs b/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs index f13e6404e431ec..04862e60d3b91c 100644 --- a/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs +++ b/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/Program.cs @@ -48,50 +48,6 @@ static int Main() bool? ExpectedSse2 = true; #if BASELINE_INTRINSICS - bool? ExpectedSse3 = null; - bool? ExpectedSsse3 = null; - bool? ExpectedSse41 = null; - bool? ExpectedSse42 = null; - bool? ExpectedPopcnt = null; - bool? ExpectedAes = null; - bool? ExpectedPclmulqdq = null; - bool? ExpectedGfni = null; - bool? ExpectedSha = null; - bool? ExpectedWaitPkg = null; - bool? ExpectedX86Serialize = null; - - bool? ExpectedAvx = false; - bool? ExpectedAvx2 = false; - bool? ExpectedBmi1 = false; - bool? ExpectedBmi2 = false; - bool? ExpectedF16c = false; - bool? ExpectedFma = false; - bool? ExpectedLzcnt = false; - bool? ExpectedAvx512F = false; - bool? ExpectedAvx512BW = false; - bool? ExpectedAvx512CD = false; - bool? ExpectedAvx512DQ = false; - bool? ExpectedAvx512Vbmi = false; - bool? ExpectedAvx512Bitalg = false; - bool? ExpectedAvx512Vbmi2 = false; - bool? ExpectedAvx512Vpopcntdq = false; - bool? ExpectedAvx512Bf16 = false; - bool? ExpectedAvx512Fp16 = false; - bool? ExpectedAvx10v1 = false; - bool? ExpectedAvx10v1V512 = false; - bool? ExpectedAvx10v2 = false; - bool? ExpectedAvx512Vp2intersect = false; - bool? ExpectedAvxIfma = false; - bool? ExpectedAvxVnni = false; - bool? ExpectedAvxVnniInt = false; - bool? ExpectedAvxVnniIntV512 = false; - bool? ExpectedGfniV256 = false; - bool? ExpectedGfniV512 = false; - bool? ExpectedAesV256 = false; - bool? ExpectedAesV512 = false; - bool? ExpectedPclmulqdqV256 = false; - bool? ExpectedPclmulqdqV512 = false; -#elif SSE42_INTRINSICS bool? ExpectedSse3 = true; bool? ExpectedSsse3 = true; bool? 
ExpectedSse41 = true; diff --git a/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/X64Sse42.csproj b/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/X64Sse42.csproj deleted file mode 100644 index c322746eb5ebfa..00000000000000 --- a/src/tests/nativeaot/SmokeTests/HardwareIntrinsics/X64Sse42.csproj +++ /dev/null @@ -1,23 +0,0 @@ - - - Exe - 0 - true - - true - - true - true - $(DefineConstants);SSE42_INTRINSICS;VECTORT128_INTRINSICS - true - false - - - - - - - - - - diff --git a/src/tests/nativeaot/SmokeTests/Preinitialization/Preinitialization.cs b/src/tests/nativeaot/SmokeTests/Preinitialization/Preinitialization.cs index a70277b0f85266..aa33b63eaea324 100644 --- a/src/tests/nativeaot/SmokeTests/Preinitialization/Preinitialization.cs +++ b/src/tests/nativeaot/SmokeTests/Preinitialization/Preinitialization.cs @@ -89,11 +89,16 @@ class Simple2 public static bool IsAvxVnniSupported = AvxVnni.IsSupported; } - class Complex + class Simple3 { public static bool IsPopcntSupported = Popcnt.IsSupported; } + class Complex + { + public static bool IsX86SerializeSupported = X86Serialize.IsSupported; + } + public static void Run() { Assert.IsPreinitialized(typeof(Simple1)); @@ -102,11 +107,14 @@ public static void Run() Assert.IsPreinitialized(typeof(Simple2)); Assert.AreEqual(AvxVnni.IsSupported, Simple2.IsAvxVnniSupported); + Assert.IsPreinitialized(typeof(Simple3)); + Assert.AreEqual(Popcnt.IsSupported, Simple3.IsPopcntSupported); + if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64) Assert.IsLazyInitialized(typeof(Complex)); else Assert.IsPreinitialized(typeof(Complex)); - Assert.AreEqual(Popcnt.IsSupported, Complex.IsPopcntSupported); + Assert.AreEqual(X86Serialize.IsSupported, Complex.IsX86SerializeSupported); } } diff --git a/src/tests/readytorun/HardwareIntrinsics/X86/CpuId_R2R_Sse42.csproj b/src/tests/readytorun/HardwareIntrinsics/X86/CpuId_R2R_Sse42.csproj deleted file mode 100644 index 43fa033f3708f8..00000000000000 --- a/src/tests/readytorun/HardwareIntrinsics/X86/CpuId_R2R_Sse42.csproj +++ /dev/null @@ -1,22 +0,0 @@ - - - - true - true - true - - - - true - true - true - - - - $(CrossGen2TestExtraArguments) --instruction-set:sse4.2 - - - - - - diff --git a/src/tests/readytorun/JittedMethodsCountingTest/JittedMethodsCountingTest.cs b/src/tests/readytorun/JittedMethodsCountingTest/JittedMethodsCountingTest.cs index 87c11e7e31a40f..521d012157e690 100644 --- a/src/tests/readytorun/JittedMethodsCountingTest/JittedMethodsCountingTest.cs +++ b/src/tests/readytorun/JittedMethodsCountingTest/JittedMethodsCountingTest.cs @@ -2,11 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Runtime; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; +using System.Runtime.InteropServices; using Xunit; -using InteropServices = System.Runtime.InteropServices; -using JitInfo = System.Runtime.JitInfo; - public class JittedMethodsCountingTest { private const int MAX_JITTED_METHODS_ACCEPTED = 70; @@ -43,10 +44,11 @@ private static bool IsReadyToRunEnabled() private static bool IsHardwareIntrinsicsEnabled() { - string? dotnetEnableHWIntrinsics = - Environment.GetEnvironmentVariable("DOTNET_EnableHWIntrinsic"); - - return (string.IsNullOrEmpty(dotnetEnableHWIntrinsics) - || dotnetEnableHWIntrinsics != "0"); + return RuntimeInformation.ProcessArchitecture switch + { + Architecture.X86 or Architecture.X64 => OperatingSystem.IsMacOS() ? 
Sse42.IsSupported : Avx2.IsSupported, + Architecture.Arm64 => AdvSimd.IsSupported, + _ => true, + }; } }
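
Note on the native baseline check above: `minipal_getcpufeatures` now reports `IntrinsicConstants_Invalid` when any of the CPUID leaf-1 bits for SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, or POPCNT is clear, and `EEJitManager::SetCpuInfo` turns that flag into a fatal startup error rather than relying on debug-only asserts. Below is a minimal managed sketch of the same leaf-1 bit tests, included only to illustrate which bits are being validated; the helper name `MeetsX64V2Baseline` is hypothetical, and this is not the runtime's actual implementation (that is the C code in cpufeatures.c above).

```csharp
using System;
using System.Runtime.Intrinsics.X86;

class BaselineCheck
{
    // Illustrative sketch: mirrors the CPUID leaf-1 bit tests added to cpufeatures.c.
    // Bit positions: EDX[25]=SSE, EDX[26]=SSE2, ECX[0]=SSE3, ECX[9]=SSSE3,
    // ECX[19]=SSE4.1, ECX[20]=SSE4.2, ECX[23]=POPCNT.
    static bool MeetsX64V2Baseline()
    {
        if (!X86Base.IsSupported)
            return false; // not running on x86/x64

        (_, _, int ecx, int edx) = X86Base.CpuId(1, 0); // leaf 1, subleaf 0

        return ((edx >> 25) & 1) != 0   // SSE
            && ((edx >> 26) & 1) != 0   // SSE2
            && ((ecx >> 0) & 1) != 0    // SSE3
            && ((ecx >> 9) & 1) != 0    // SSSE3
            && ((ecx >> 19) & 1) != 0   // SSE4.1
            && ((ecx >> 20) & 1) != 0   // SSE4.2
            && ((ecx >> 23) & 1) != 0;  // POPCNT
    }

    static void Main() => Console.WriteLine(MeetsX64V2Baseline());
}
```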
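
Note on the implication tables: with SSE4.2 folded into `X86Base`, edges such as `AVX -> SSE42` and `GFNI -> SSE42` in InstructionSetDesc.txt become `AVX -> X86Base` and `GFNI -> X86Base`, and the generated `ExpandInstructionSetByImplicationHelper` closes the requested instruction sets over those edges. A rough sketch of that closure, using a small hypothetical edge list in place of the generated if-chains:

```csharp
using System;
using System.Collections.Generic;

class IsaClosure
{
    // Hypothetical edge list modeled on InstructionSetDesc.txt after this change:
    // each ISA implies the ISAs it depends on.
    static readonly Dictionary<string, string[]> Implications = new()
    {
        ["AVX"] = new[] { "X86Base" },
        ["AVX2"] = new[] { "AVX" },
        ["AVX512"] = new[] { "AVX2" },
        ["GFNI"] = new[] { "X86Base" },
        ["GFNI_V256"] = new[] { "GFNI", "AVX" },
    };

    // Expand a requested set to its fixed point, the way the generated
    // ExpandInstructionSetByImplicationHelper does with explicit if-chains.
    static HashSet<string> Expand(IEnumerable<string> requested)
    {
        var result = new HashSet<string>(requested);
        bool changed = true;
        while (changed)
        {
            changed = false;
            foreach (var isa in new List<string>(result))
            {
                if (Implications.TryGetValue(isa, out var deps))
                {
                    foreach (var dep in deps)
                        changed |= result.Add(dep);
                }
            }
        }
        return result;
    }

    // Expanding GFNI_V256 yields GFNI, AVX, and X86Base as well (order not guaranteed).
    static void Main() =>
        Console.WriteLine(string.Join(", ", Expand(new[] { "GFNI_V256" })));
}
```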