-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Allow configuration template to disable some SIMD. #20215
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
41b5568
18f9c69
9e115a7
d14e4b3
7a7dbd0
74adb0e
d01bcc8
a2090b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -315,7 +315,7 @@ Return Value: | |
__cpuid(1, Cpuid1[0], Cpuid1[1], Cpuid1[2], Cpuid1[3]); | ||
#endif | ||
|
||
#if defined(_MSC_VER) | ||
#if defined(_MSC_VER) && !defined(ORT_DISABLE_SSE4) | ||
|
||
// | ||
// Check if the processor supports SSE 4.1 instructions. | ||
|
@@ -328,7 +328,7 @@ Return Value: | |
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchSse41; | ||
} | ||
|
||
#endif | ||
#endif // defined(_MSC_VER) && !defined(ORT_DISABLE_SSE4) | ||
|
||
// | ||
// Check if the processor supports the AVX and OSXSAVE features. | ||
|
@@ -348,10 +348,13 @@ Return Value: | |
|
||
if ((xcr0 & 0x6) == 0x6) { | ||
|
||
#if !defined(ORT_DISABLE_AVX) | ||
this->GemmFloatKernel = MlasGemmFloatKernelAvx; | ||
#endif // !defined(ORT_DISABLE_AVX) | ||
|
||
#if defined(MLAS_TARGET_AMD64) | ||
|
||
#if !defined(ORT_DISABLE_AVX) | ||
this->KernelM1Routine = MlasSgemmKernelM1Avx; | ||
this->KernelM1TransposeBRoutine = MlasSgemmKernelM1TransposeBAvx; | ||
this->TransposePackB16x4Routine = MlasSgemmTransposePackB16x4Avx; | ||
|
@@ -368,7 +371,7 @@ Return Value: | |
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelAvx; | ||
this->ReduceMinimumMaximumF32Kernel = MlasReduceMinimumMaximumF32KernelAvx; | ||
this->GemmU8U8Kernel = nullptr; | ||
|
||
#endif // !defined(ORT_DISABLE_AVX) | ||
// | ||
// Check if the processor supports AVX2/FMA3 features. | ||
// | ||
|
@@ -381,7 +384,7 @@ Return Value: | |
#endif | ||
|
||
if (((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) { | ||
|
||
#if !defined(ORT_DISABLE_AVX2) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This #ifdef block also includes FMA3 kernels. Should those also be controlled by the macro that disables AVX2? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You're right; I've regrouped the FMA3 kernels together and moved them outside the block. |
||
this->Avx2Supported_ = true; | ||
|
||
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchAvx2; | ||
|
@@ -390,6 +393,17 @@ Return Value: | |
this->GemmU8U8Dispatch = &MlasGemmU8U8DispatchAvx2; | ||
this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx2; | ||
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx2; | ||
this->QLinearAddS8Kernel = MlasQLinearAddS8KernelAvx2; | ||
this->QLinearAddU8Kernel = MlasQLinearAddU8KernelAvx2; | ||
this->ConvDepthwiseU8S8Kernel = MlasConvDepthwiseKernelAvx2<uint8_t, int8_t>; | ||
this->ConvDepthwiseU8U8Kernel = MlasConvDepthwiseKernelAvx2<uint8_t, uint8_t>; | ||
this->ConvDepthwiseS8S8Kernel = MlasConvDepthwiseKernelAvx2<int8_t, int8_t>; | ||
this->ConvDepthwiseS8U8Kernel = MlasConvDepthwiseKernelAvx2<int8_t, uint8_t>; | ||
this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2; | ||
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelAvx2; | ||
this->CastF32ToF16Kernel = &MlasCastF32ToF16KernelAvx2; | ||
this->RopeDispatch = &MlasRopeDispatchAvx2; | ||
#endif // !defined(ORT_DISABLE_AVX2) | ||
|
||
this->GemmFloatKernel = MlasGemmFloatKernelFma3; | ||
this->GemmDoubleKernel = MlasGemmDoubleKernelFma3; | ||
|
@@ -401,18 +415,7 @@ Return Value: | |
this->LogisticKernelRoutine = MlasComputeLogisticF32KernelFma3; | ||
this->TanhKernelRoutine = MlasComputeTanhF32KernelFma3; | ||
this->ErfKernelRoutine = MlasErfKernelFma3; | ||
this->QLinearAddS8Kernel = MlasQLinearAddS8KernelAvx2; | ||
this->QLinearAddU8Kernel = MlasQLinearAddU8KernelAvx2; | ||
this->ConvDepthwiseU8S8Kernel = MlasConvDepthwiseKernelAvx2<uint8_t, int8_t>; | ||
this->ConvDepthwiseU8U8Kernel = MlasConvDepthwiseKernelAvx2<uint8_t, uint8_t>; | ||
this->ConvDepthwiseS8S8Kernel = MlasConvDepthwiseKernelAvx2<int8_t, int8_t>; | ||
this->ConvDepthwiseS8U8Kernel = MlasConvDepthwiseKernelAvx2<int8_t, uint8_t>; | ||
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelFma3; | ||
this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2; | ||
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelAvx2; | ||
this->CastF32ToF16Kernel = &MlasCastF32ToF16KernelAvx2; | ||
this->RopeDispatch = &MlasRopeDispatchAvx2; | ||
|
||
|
||
// | ||
// Check if the processor supports Hybrid core architecture. | ||
|
@@ -433,6 +436,7 @@ Return Value: | |
__cpuid_count(7, 1, Cpuid7_1[0], Cpuid7_1[1], Cpuid7_1[2], Cpuid7_1[3]); | ||
#endif | ||
|
||
#if !defined(ORT_DISABLE_AVX2) | ||
if ((Cpuid7_1[0] & 0x10) != 0) { | ||
|
||
this->GemmU8U8Dispatch = &MlasGemmU8S8DispatchAvx2; | ||
|
@@ -441,9 +445,11 @@ Return Value: | |
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvxVnni; | ||
this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2vnni; | ||
} | ||
#endif // !defined(ORT_DISABLE_AVX2) | ||
|
||
#if !defined(ORT_MINIMAL_BUILD) | ||
|
||
#if !defined(ORT_DISABLE_AVX512) | ||
// | ||
// Check if the processor supports AVX512F features and the | ||
// operating system supports saving AVX512F state. | ||
|
@@ -499,7 +505,10 @@ Return Value: | |
} | ||
} | ||
} | ||
#endif // !defined(ORT_DISABLE_AVX512) | ||
|
||
|
||
#if !defined(ORT_DISABLE_AVX2) | ||
// | ||
// Check if the processor supports AVX-VNNI-INT8 | ||
// | ||
|
@@ -510,18 +519,22 @@ Return Value: | |
this->GemmS8U8Dispatch = &MlasGemmS8U8DispatchAvx2Vnni; | ||
this->GemmS8U8Kernel = MlasGemmS8U8KernelAvx2Vnni; | ||
} | ||
#endif // !defined(ORT_DISABLE_AVX2) | ||
|
||
#ifndef __APPLE__ | ||
#if !defined(__APPLE__) | ||
#if (defined(_MSC_VER) && (_MSC_VER >= 1933)) || (defined(__GNUC__) && (__GNUC__ >= 13)) | ||
#if !defined(ORT_DISABLE_AVX) | ||
// | ||
// Check if the processor supports AVX NE CONVERT. | ||
// | ||
if ((Cpuid7_1[3] & (0b1 << 5)) != 0) { | ||
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelAvx; | ||
} | ||
#endif // !defined(ORT_DISABLE_AVX) | ||
#endif // (defined(_MSC_VER) && (_MSC_VER >= 1933)) || (defined(__GNUC__) && (__GNUC__ >= 13)) | ||
|
||
|
||
#if !defined(ORT_DISABLE_AMX) | ||
// | ||
// Check if the processor supports AMX-TILE and AMX-INT8 | ||
// features. | ||
|
@@ -534,14 +547,16 @@ Return Value: | |
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchAmx; | ||
} | ||
} | ||
#endif // __APPLE__ | ||
#endif // !defined(ORT_DISABLE_AMX) | ||
#endif // !defined(__APPLE__) | ||
|
||
#endif // ORT_MINIMAL_BUILD | ||
|
||
} | ||
|
||
#endif // MLAS_TARGET_AMD64 | ||
|
||
|
||
} | ||
} | ||
|
||
|
@@ -797,4 +812,4 @@ thread_local size_t ThreadedBufSize = 0; | |
thread_local std::unique_ptr<uint8_t, decltype(&_aligned_free)> ThreadedBufHolder(nullptr, &_aligned_free); | ||
#else | ||
thread_local std::unique_ptr<uint8_t, decltype(&free)> ThreadedBufHolder(nullptr, &free); | ||
#endif | ||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where do these macros come from?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried to follow the way features are enabled or disabled elsewhere in the code (for instance, ORT_ENABLE_STREAM). The macros would then be defined on the CMake command line, e.g. with '-D CMAKE_CXX_FLAGS=-DORT_DISABLE_AVX2'.
Is there a preferred way of doing this? Should it be done through CMake's option() command instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be preferable to make them CMake options. They should be documented too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure? I don't want to overcrowd the CMake options; this would add five more.
If you're positive, I will add the options and the documentation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think they should be documented. It's fine to add additional CMake options. These are additional build configuration options, so that seems like a reasonable place for them.