
Commit f8cb1e9

mszhanyi authored and ankitm3k committed
Build CUDA and DML together (microsoft#22602)
### Description
We now need to build CUDA and DML in one package, but the CUDA EP and DML EP cannot run in the same process: doing so throws the exception `the GPU device instance has been suspended`. In other words, the two EPs can coexist at compile time but not at run time. This PR therefore splits the CUDA EP tests and the DML EP tests in all unit tests. The solution is to use two environment variables, NO_CUDA_TEST and NO_DML_TEST, in CI. For example, when NO_CUDA_TEST is set, DefaultCudaExecutionProvider() returns nullptr and the test does not run with the CUDA EP; while debugging, the CUDA execution provider is never invoked. As long as CUDA functions such as cudaSetDevice are not called, the DML EP tests should pass. The Java test testDirectML is disabled because it does not work at the moment, even without the CUDA EP.
1 parent 13fcfa6 commit f8cb1e9

24 files changed: +308 / -29 lines changed
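The environment-variable gating itself lives in the test utilities rather than in the hunks shown below. A minimal sketch of the idea, assuming hypothetical names (DisabledByEnv, FakeExecutionProvider) and not the commit's literal code: when CI exports NO_CUDA_TEST=1 or NO_DML_TEST=1, the corresponding default-provider factory returns nullptr, so guarded tests either skip or drop that EP.

// Sketch only: how the test utilities could map the CI variables
// NO_CUDA_TEST / NO_DML_TEST to a null default provider.
#include <cstdlib>
#include <cstring>
#include <memory>

struct FakeExecutionProvider {};  // stand-in for onnxruntime::IExecutionProvider

static bool DisabledByEnv(const char* name) {
  const char* v = std::getenv(name);  // e.g. NO_CUDA_TEST=1 set by the CI job
  return v != nullptr && std::strcmp(v, "1") == 0;
}

std::unique_ptr<FakeExecutionProvider> DefaultCudaExecutionProvider() {
  if (DisabledByEnv("NO_CUDA_TEST")) {
    return nullptr;  // guarded tests skip or drop the CUDA EP; cudaSetDevice() is never reached
  }
  return std::make_unique<FakeExecutionProvider>();
}

std::unique_ptr<FakeExecutionProvider> DefaultDmlExecutionProvider() {
  if (DisabledByEnv("NO_DML_TEST")) {
    return nullptr;  // same idea for the CUDA-only CI pass
  }
  return std::make_unique<FakeExecutionProvider>();
}

With a nullptr default provider, the NO_CUDA_TEST pass exercises only the DML EP in the process, which is the run-time separation the description relies on.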

java/src/test/java/ai/onnxruntime/InferenceTest.java

Lines changed: 1 addition & 0 deletions
@@ -737,6 +737,7 @@ public void testCoreML() throws OrtException {
     runProvider(OrtProvider.CORE_ML);
   }
 
+  @Disabled("DirectML Java API hasn't been supported yet")
   @Test
   @EnabledIfSystemProperty(named = "USE_DML", matches = "1")
   public void testDirectML() throws OrtException {

onnxruntime/test/common/cuda_op_test_utils.h

Lines changed: 9 additions & 0 deletions
@@ -5,6 +5,11 @@
 
 #include "test/util/include/default_providers.h"
 
+#define SKIP_CUDA_TEST_WITH_DML                                           \
+  if (DefaultCudaExecutionProvider() == nullptr) {                        \
+    GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled";  \
+  }
+
 namespace onnxruntime {
 namespace test {
 
@@ -13,6 +18,10 @@ namespace test {
 int GetCudaArchitecture();
 
 inline bool HasCudaEnvironment(int min_cuda_architecture) {
+  if (DefaultCudaExecutionProvider() == nullptr) {
+    return false;
+  }
+
   if (DefaultCudaExecutionProvider().get() == nullptr) {
     return false;
   }
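The macro is used at the top of CUDA-specific test bodies, as the beam_search_test.cc changes below show; a condensed illustration of the intended pattern (the test name here is made up):

// Condensed illustration: CUDA-only tests guard themselves with the new macro
// when both EPs are compiled into the binary.
#include "gtest/gtest.h"
#include "test/common/cuda_op_test_utils.h"  // defines SKIP_CUDA_TEST_WITH_DML

TEST(SomeCudaOnlyOpTest, SkipsWhenCudaEpIsUnavailable) {
#if defined(USE_CUDA) && defined(USE_DML)
  // Expands to GTEST_SKIP() when DefaultCudaExecutionProvider() is nullptr,
  // i.e. when CI set NO_CUDA_TEST for the DML-only pass.
  SKIP_CUDA_TEST_WITH_DML;
#endif
  // ... build the model and run it with the CUDA EP ...
}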

onnxruntime/test/contrib_ops/beam_search_test.cc

Lines changed: 6 additions & 0 deletions
@@ -73,6 +73,9 @@ TEST(BeamSearchTest, GptBeamSearchFp32) {
   const char* const output_names[] = {"sequences"};
 
   Ort::SessionOptions session_options;
+#if defined(USE_CUDA) && defined(USE_DML)
+  SKIP_CUDA_TEST_WITH_DML;
+#endif
 #ifdef USE_CUDA
   OrtCUDAProviderOptionsV2 cuda_options;
   cuda_options.use_tf32 = false;
@@ -166,6 +169,9 @@ TEST(BeamSearchTest, GptBeamSearchFp16) {
   bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
   if (enable_cuda || enable_rocm) {
     Ort::SessionOptions session_options;
+#if defined(USE_CUDA) && defined(USE_DML)
+    SKIP_CUDA_TEST_WITH_DML;
+#endif
 #ifdef USE_CUDA
     OrtCUDAProviderOptionsV2 cuda_options;
     cuda_options.use_tf32 = false;

onnxruntime/test/contrib_ops/bias_dropout_op_test.cc

Lines changed: 3 additions & 0 deletions
@@ -181,6 +181,9 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
   t.SetCustomOutputVerifier(output_verifier);
   std::vector<std::unique_ptr<IExecutionProvider>> t_eps;
 #ifdef USE_CUDA
+  if (DefaultCudaExecutionProvider() == nullptr) {
+    return;
+  }
   t_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   t_eps.emplace_back(DefaultRocmExecutionProvider());

onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc

Lines changed: 6 additions & 1 deletion
@@ -61,7 +61,9 @@ void RunTestForInference(const std::vector<int64_t>& input_dims, bool has_ratio
 
   std::vector<std::unique_ptr<IExecutionProvider>> test_eps;
 #ifdef USE_CUDA
-  test_eps.emplace_back(DefaultCudaExecutionProvider());
+  if (DefaultCudaExecutionProvider() != nullptr) {
+    test_eps.emplace_back(DefaultCudaExecutionProvider());
+  }
 #elif USE_ROCM
   test_eps.emplace_back(DefaultRocmExecutionProvider());
 #endif
@@ -122,6 +124,9 @@ void RunTestForTraining(const std::vector<int64_t>& input_dims) {
 
   std::vector<std::unique_ptr<IExecutionProvider>> dropout_eps;
 #ifdef USE_CUDA
+  if (DefaultCudaExecutionProvider() == nullptr) {
+    return;
+  }
   dropout_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   dropout_eps.emplace_back(DefaultRocmExecutionProvider());

onnxruntime/test/contrib_ops/layer_norm_test.cc

Lines changed: 10 additions & 3 deletions
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "test/providers/compare_provider_test_utils.h"
+#include "test/util/include/default_providers.h"
 
 namespace onnxruntime {
 namespace test {
@@ -79,14 +80,20 @@ static void TestLayerNorm(const std::vector<int64_t>& x_dims,
 #endif
 
 #ifdef USE_CUDA
-  test.CompareWithCPU(kCudaExecutionProvider);
+  if (DefaultCudaExecutionProvider() != nullptr) {
+    test.CompareWithCPU(kCudaExecutionProvider);
+  }
 #elif USE_ROCM
   test.CompareWithCPU(kRocmExecutionProvider);
-#elif USE_DML
-  test.CompareWithCPU(kDmlExecutionProvider);
 #elif USE_WEBGPU
   test.CompareWithCPU(kWebGpuExecutionProvider);
 #endif
+
+#ifdef USE_DML
+  if (DefaultDmlExecutionProvider() != nullptr) {
+    test.CompareWithCPU(kDmlExecutionProvider);
+  }
+#endif
 }
 
 TEST(CudaKernelTest, LayerNorm_NullInput) {

onnxruntime/test/contrib_ops/matmul_4bits_test.cc

Lines changed: 20 additions & 8 deletions
@@ -489,13 +489,17 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   if (use_float16) {
 #ifdef USE_CUDA
-    execution_providers.push_back(DefaultCudaExecutionProvider());
+    if (DefaultCudaExecutionProvider() != nullptr) {
+      execution_providers.push_back(DefaultCudaExecutionProvider());
+    }
 #endif
 #ifdef USE_ROCM
     execution_providers.push_back(DefaultRocmExecutionProvider());
 #endif
 #ifdef USE_DML
-    execution_providers.push_back(DefaultDmlExecutionProvider());
+    if (DefaultDmlExecutionProvider() != nullptr) {
+      execution_providers.push_back(DefaultDmlExecutionProvider());
+    }
 #endif
 #ifdef USE_WEBGPU
     execution_providers.push_back(DefaultWebGpuExecutionProvider());
@@ -513,8 +517,11 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura
 }  // namespace
 
 TEST(MatMulNBits, Float16Cuda) {
-#if defined(USE_CUDA) || defined(USE_ROCM)
-  auto has_gidx_options = {true, false};
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
+  std::vector<bool> has_gidx_options = {true, false};
+  if (DefaultDmlExecutionProvider() != nullptr) {
+    has_gidx_options.assign(1, false);
+  }
 #else
   auto has_gidx_options = {false};
 #endif
@@ -525,7 +532,9 @@ TEST(MatMulNBits, Float16Cuda) {
     for (auto block_size : {16, 32, 64, 128}) {
       for (auto has_gidx : has_gidx_options) {
 #ifdef USE_DML
-        RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+        if (DefaultDmlExecutionProvider() != nullptr) {
+          RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+        }
 #else
         RunTest(M, N, K, block_size, 0, false, true, has_gidx);
         RunTest(M, N, K, block_size, 0, true, true, has_gidx, false);
@@ -538,12 +547,16 @@ TEST(MatMulNBits, Float16Cuda) {
 }
 
 TEST(MatMulNBits, Float16Large) {
-#ifdef USE_DML
+#if defined(USE_CUDA) || defined(USE_DML)
   // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail
   // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an
   // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number
   // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances.
-  float abs_error = 0.3f;
+  float abs_error = 0.05f;
+  if (DefaultDmlExecutionProvider() != nullptr) {
+    // it means the ep is dml in runtime, the abs_error is changed to 0.3f
+    abs_error = 0.3f;
+  }
 #elif USE_WEBGPU
   // See Intel A770 to pass these tests with an absolute error of 0.08.
   float abs_error = 0.08f;
@@ -559,7 +572,6 @@ TEST(MatMulNBits, Float16Large) {
     }
   }
 }
-
 #endif  // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
 }  // namespace test
 }  // namespace onnxruntime

onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc

Lines changed: 1 addition & 1 deletion
@@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
 }
 
 // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output
-#if defined(USE_DML)
+#if defined(USE_DML) && !defined(USE_CUDA)
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
   RunMatMulIntegerToFloatTest<int8_t, uint8_t, float, true, false>();

onnxruntime/test/contrib_ops/tensor_op_test.cc

Lines changed: 19 additions & 1 deletion
@@ -121,7 +121,15 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz
   test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
   test.AddInput<float>("input", {N, C, H, W}, X);
   test.AddOutput<float>("output", {N, C, H, W}, result);
+#if defined(USE_CUDA) && defined(USE_DML)
+  if (DefaultCudaExecutionProvider() == nullptr) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
+  } else if (DefaultDmlExecutionProvider() == nullptr) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
+  }
+#else
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider});  // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
+#endif
 }
 
 void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) {
@@ -188,7 +196,15 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va
   test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
   test.AddInput<float>("input", {N, C, H, W}, X);
   test.AddOutput<float>("output", {N, C, H, W}, result);
+#if defined(USE_CUDA) && defined(USE_DML)
+  if (DefaultCudaExecutionProvider() == nullptr) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
+  } else if (DefaultDmlExecutionProvider() == nullptr) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
+  }
+#else
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider});  // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
+#endif
 }
 
 TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) {
@@ -230,7 +246,9 @@ TEST(UnfoldTensorOpTest, LastDim) {
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
 #ifdef USE_CUDA
-  execution_providers.push_back(DefaultCudaExecutionProvider());
+  if (DefaultCudaExecutionProvider() != nullptr) {
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+  }
 #endif
   execution_providers.push_back(DefaultCpuExecutionProvider());
   tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
