Skip to content

Commit 0ffbf94

Browse files
fix: Changes in the Device Reset Implementation
Before the object-in-use error is returned, the device is bound to the kernel driver and initialized again. Related-To: NEO-10946 Signed-off-by: Bari, Pratik <[email protected]> Source: 6d7e3dc
1 parent fbdabb1 commit 0ffbf94

File tree

7 files changed

+171
-2
lines changed

7 files changed

+171
-2
lines changed

level_zero/sysman/source/api/global_operations/linux/sysman_os_global_operations_imp.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,21 @@ ze_result_t LinuxGlobalOperationsImp::resetImpl(ze_bool_t force, zes_reset_type_
345345
for (auto &&pid : deviceUsingPids) {
346346
while (pProcfsAccess->isAlive(pid)) {
347347
if (std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() > resetTimeout) {
348+
349+
if (resetType == ZES_RESET_TYPE_FLR || resetType == ZES_RESET_TYPE_COLD) {
350+
result = pSysfsAccess->bindDevice(resetName);
351+
if (ZE_RESULT_SUCCESS != result) {
352+
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to bind the device to the kernel driver and returning error:0x%x \n", __FUNCTION__, result);
353+
return result;
354+
}
355+
}
356+
357+
result = pLinuxSysmanImp->reInitSysmanDeviceResources();
358+
if (ZE_RESULT_SUCCESS != result) {
359+
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to init the device and returning error:0x%x \n", __FUNCTION__, result);
360+
return result;
361+
}
362+
348363
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Timeout reached, device still in use and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
349364
return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
350365
}

level_zero/sysman/source/shared/linux/zes_os_sysman_imp.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2023 Intel Corporation
2+
* Copyright (C) 2023-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -466,13 +466,15 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
466466
"Card Bus remove after resizing VF bar failed\n");
467467
return result;
468468
}
469+
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds to make sure that the config spaces of all devices are saved correctly.
469470

470471
result = pFsAccess->write(rootPortPath + '/' + "rescan", "1");
471472
if (ZE_RESULT_SUCCESS != result) {
472473
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
473474
"Rescanning root port failed after resizing VF bar failed\n");
474475
return result;
475476
}
477+
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds, allows the rescan to complete on all devices attached to the root port.
476478
}
477479
return result;
478480
}

level_zero/sysman/test/unit_tests/sources/global_operations/linux/mock_global_operations.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,12 @@ struct MockGlobalOpsLinuxSysmanImp : public L0::Sysman::LinuxSysmanImp {
635635
// Stores the result that the mocked reInitSysmanDeviceResources() hands back.
void setMockInitDeviceError(ze_result_t result) {
636636
mockInitDeviceError = result;
637637
}
638+
// Mock override: no real re-initialization is performed. Reports the value held in
// mockInitDeviceError, i.e. the injected error when one was set, otherwise
// ZE_RESULT_SUCCESS.
ze_result_t reInitSysmanDeviceResources() override {
    const bool errorInjected = (mockInitDeviceError != ZE_RESULT_SUCCESS);
    return errorInjected ? mockInitDeviceError : ZE_RESULT_SUCCESS;
}
638644
};
639645

640646
constexpr int mockFdGlobalOperations = 33;

level_zero/sysman/test/unit_tests/sources/global_operations/linux/test_zes_global_operations.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class SysmanGlobalOperationsFixture : public SysmanDeviceFixture {
5454
std::unique_ptr<MockGlobalOperationsSysfsAccess> pSysfsAccess;
5555
std::unique_ptr<MockGlobalOperationsProcfsAccess> pProcfsAccess;
5656
std::unique_ptr<MockGlobalOperationsFsAccess> pFsAccess;
57+
std::unique_ptr<MockGlobalOpsLinuxSysmanImp> pMockGlobalOpsLinuxSysmanImp;
5758
L0::Sysman::EngineHandleContext *pEngineHandleContextOld = nullptr;
5859
L0::Sysman::DiagnosticsHandleContext *pDiagnosticsHandleContextOld = nullptr;
5960
L0::Sysman::FirmwareHandleContext *pFirmwareHandleContextOld = nullptr;
@@ -85,6 +86,7 @@ class SysmanGlobalOperationsFixture : public SysmanDeviceFixture {
8586
pDiagnosticsHandleContext = std::make_unique<MockGlobalOperationsDiagnosticsHandleContext>(pOsSysman);
8687
pFirmwareHandleContext = std::make_unique<MockGlobalOperationsFirmwareHandleContext>(pOsSysman);
8788
pRasHandleContext = std::make_unique<MockGlobalOperationsRasHandleContext>(pOsSysman);
89+
pMockGlobalOpsLinuxSysmanImp = std::make_unique<MockGlobalOpsLinuxSysmanImp>(pLinuxSysmanImp->getSysmanDeviceImp());
8890

8991
auto pDrmLocal = new DrmGlobalOpsMock(const_cast<NEO::RootDeviceEnvironment &>(pSysmanDeviceImp->getRootDeviceEnvironment()));
9092
pDrmLocal->setupIoctlHelper(pSysmanDeviceImp->getRootDeviceEnvironment().getHardwareInfo()->platform.eProductFamily);
@@ -772,6 +774,22 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetE
772774
EXPECT_EQ(pFsAccess->mockFlrValue, "1");
773775
}
774776

777+
// Cold reset requested through zesDeviceResetExt while the device is reported as in
// use and the reset timeout is zero. The sysfs mock is told to fail bindDevice, and
// the test expects that bind error to be the result of the reset call.
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingZesDeviceResetExtForColdResetThenErrorIsReturned) {
    initGlobalOps();

    auto *pOsGlobalOps = static_cast<PublicLinuxGlobalOperationsImp *>(pGlobalOperationsImp->pOsGlobalOperations);
    pOsGlobalOps->resetTimeout = 0; // expire the wait immediately

    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;

    // Queue the mocked process-list replies: DEVICE_UNUSED once, then DEVICE_IN_USE
    // flagged as repeated.
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
    pProcfsAccess->isRepeated.push_back(false);
    pProcfsAccess->isRepeated.push_back(true);
    pProcfsAccess->mockNoKill = true;

    // Injected failure for the driver bind step.
    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE;

    zes_reset_properties_t resetProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES,
                                              .pNext = nullptr,
                                              .force = true,
                                              .resetType = ZES_RESET_TYPE_COLD};
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, zesDeviceResetExt(device, &resetProperties));
}
792+
775793
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) {
776794
init(true);
777795
DebugManagerStateRestore dbgRestore;
@@ -836,6 +854,51 @@ TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseWhenCallingReset
836854
EXPECT_EQ(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, result);
837855
}
838856

857+
// Forced zesDeviceReset with the device reported as in use and a zero reset timeout.
// The sysfs mock is told to fail bindDevice, and the test expects that bind error to
// be the result of the reset call.
TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseAndBindingFailsDuringResetWhenCallingResetThenErrorIsReturned) {
    initGlobalOps();

    auto *pOsGlobalOps = static_cast<PublicLinuxGlobalOperationsImp *>(pGlobalOperationsImp->pOsGlobalOperations);
    pOsGlobalOps->resetTimeout = 0; // expire the wait immediately

    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;

    // Queue the mocked process-list replies: DEVICE_UNUSED once, then DEVICE_IN_USE
    // flagged as repeated.
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
    pProcfsAccess->isRepeated.push_back(false);
    pProcfsAccess->isRepeated.push_back(true);
    pProcfsAccess->mockNoKill = true;

    // Injected failure for the driver bind step.
    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE;

    const ze_bool_t force = true;
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, zesDeviceReset(device, force));
}
875+
876+
// Forced zesDeviceReset with the device reported as in use and a zero reset timeout.
// The OS global-operations object is pointed at the mock LinuxSysmanImp, whose
// re-initialization is set to fail with ZE_RESULT_ERROR_UNKNOWN; the test expects
// that error to be the result of the reset call.
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseAndReInitFailsDuringResetWhenCallingResetThenErrorIsReturned) {
    initGlobalOps();

    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;

    // Swap in the mock sysman implementation and make the wait expire immediately.
    auto *pOsGlobalOps = static_cast<PublicLinuxGlobalOperationsImp *>(pGlobalOperationsImp->pOsGlobalOperations);
    pOsGlobalOps->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
    pOsGlobalOps->resetTimeout = 0;

    // Let the mock share the fixture's filesystem access mocks.
    auto *pMockSysman = pMockGlobalOpsLinuxSysmanImp.get();
    pMockSysman->pProcfsAccess = pProcfsAccess.get();
    pMockSysman->pSysfsAccess = pSysfsAccess.get();
    pMockSysman->pFsAccess = pFsAccess.get();

    pMockSysman->ourDevicePid = pProcfsAccess->ourDevicePid;
    // NOTE(review): the fd field receives ourDevicePid rather than ourDeviceFd; this
    // looks like a copy-paste slip. Confirm the intent before changing it.
    pMockSysman->ourDeviceFd = pProcfsAccess->ourDevicePid;
    pMockSysman->setMockInitDeviceError(ZE_RESULT_ERROR_UNKNOWN);

    // Queue the mocked process-list replies: DEVICE_UNUSED once, then DEVICE_IN_USE
    // flagged as repeated.
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
    pProcfsAccess->isRepeated.push_back(false);
    pProcfsAccess->isRepeated.push_back(true);
    pProcfsAccess->mockNoKill = true;

    const ze_bool_t force = true;
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, zesDeviceReset(device, force));
}
901+
839902
TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceNotInUseWhenCallingResetThenSuccessIsReturned) {
840903

841904
// Pretend we have the device open

level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,25 @@ ze_result_t LinuxGlobalOperationsImp::resetImpl(ze_bool_t force, zes_reset_type_
265265
for (auto &&pid : deviceUsingPids) {
266266
while (pProcfsAccess->isAlive(pid)) {
267267
if (std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() > resetTimeout) {
268+
269+
if (resetType == ZES_RESET_TYPE_FLR || resetType == ZES_RESET_TYPE_COLD) {
270+
result = pSysfsAccess->bindDevice(resetName);
271+
if (ZE_RESULT_SUCCESS != result) {
272+
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to bind the device to the kernel driver and returning error:0x%x \n", __FUNCTION__, result);
273+
return result;
274+
}
275+
}
276+
277+
result = pLinuxSysmanImp->initDevice();
278+
if (ZE_RESULT_SUCCESS != result) {
279+
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed to init the device and returning error:0x%x \n", __FUNCTION__, result);
280+
return result;
281+
}
282+
268283
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Timeout reached, device still in use and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
269284
return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
270285
}
286+
271287
struct ::timespec timeout = {.tv_sec = 0, .tv_nsec = 1000};
272288
::nanosleep(&timeout, NULL);
273289
end = std::chrono::steady_clock::now();

level_zero/tools/source/sysman/linux/os_sysman_imp.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2023 Intel Corporation
2+
* Copyright (C) 2020-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -516,13 +516,15 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
516516
"Card Bus remove after resizing VF bar failed\n");
517517
return result;
518518
}
519+
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds to make sure that the config spaces of all devices are saved correctly.
519520

520521
result = pFsAccess->write(rootPortPath + '/' + "rescan", "1");
521522
if (ZE_RESULT_SUCCESS != result) {
522523
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
523524
"Rescanning root port failed after resizing VF bar failed\n");
524525
return result;
525526
}
527+
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds, allows the rescan to complete on all devices attached to the root port.
526528
}
527529
return result;
528530
}

level_zero/tools/test/unit_tests/sources/sysman/global_operations/linux/test_zes_global_operations.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,26 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetE
168168
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
169169
}
170170

171+
// Cold reset requested through zesDeviceResetExt while the device is reported as in
// use and the reset timeout is zero. The OS global-operations object is pointed at
// the mock LinuxSysmanImp, the sysfs mock is told to fail bindDevice, and the test
// expects that bind error to be the result of the reset call.
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingZesDeviceResetExtForColdResetThenErrorIsReturned) {
    initGlobalOps();

    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;

    // Swap in the mock sysman implementation, give it the fixture's device handle,
    // and make the wait expire immediately.
    auto *pOsGlobalOps = static_cast<PublicLinuxGlobalOperationsImp *>(pGlobalOperationsImp->pOsGlobalOperations);
    pOsGlobalOps->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
    pOsGlobalOps->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle();
    pOsGlobalOps->resetTimeout = 0;

    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
    // NOTE(review): the fd field receives ourDevicePid rather than ourDeviceFd; this
    // looks like a copy-paste slip. Confirm the intent before changing it.
    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDevicePid;

    // Queue the mocked process-list replies: DEVICE_UNUSED once, then DEVICE_IN_USE
    // flagged as repeated.
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
    pProcfsAccess->isRepeated.push_back(false);
    pProcfsAccess->isRepeated.push_back(true);
    pProcfsAccess->mockNoKill = true;

    // Injected failure for the driver bind step.
    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE;

    zes_reset_properties_t resetProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES,
                                              .pNext = nullptr,
                                              .force = true,
                                              .resetType = ZES_RESET_TYPE_COLD};
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, zesDeviceResetExt(device, &resetProperties));
}
190+
171191
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) {
172192
DebugManagerStateRestore dbgRestore;
173193
debugManager.flags.VfBarResourceAllocationWa.set(false);
@@ -837,6 +857,51 @@ TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseWhenCallingReset
837857
EXPECT_EQ(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE, result);
838858
}
839859

860+
// Forced zesDeviceReset with the device reported as in use and a zero reset timeout.
// The OS global-operations object is pointed at the mock LinuxSysmanImp, the sysfs
// mock is told to fail bindDevice, and the test expects that bind error to be the
// result of the reset call.
TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceInUseAndBindingFailsDuringResetWhenCallingResetThenErrorIsReturned) {
    initGlobalOps();

    // Use a pid from the mock's list so the device holder is not this process.
    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;

    // Swap in the mock sysman implementation, give it the fixture's device handle,
    // and make the wait expire immediately.
    auto *pOsGlobalOps = static_cast<PublicLinuxGlobalOperationsImp *>(pGlobalOperationsImp->pOsGlobalOperations);
    pOsGlobalOps->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
    pOsGlobalOps->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle();
    pOsGlobalOps->resetTimeout = 0;

    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
    // NOTE(review): the fd field receives ourDevicePid rather than ourDeviceFd; this
    // looks like a copy-paste slip. Confirm the intent before changing it.
    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDevicePid;

    // Queue the mocked process-list replies: DEVICE_UNUSED once, then DEVICE_IN_USE
    // flagged as repeated.
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
    pProcfsAccess->isRepeated.push_back(false);
    pProcfsAccess->isRepeated.push_back(true);
    pProcfsAccess->mockNoKill = true;

    // Injected failure for the driver bind step.
    pSysfsAccess->mockBindDeviceError = ZE_RESULT_ERROR_NOT_AVAILABLE;

    const ze_bool_t force = true;
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, zesDeviceReset(device, force));
}
882+
883+
// Forced zesDeviceReset with the device reported as in use and a zero reset timeout.
// The OS global-operations object is pointed at the mock LinuxSysmanImp, whose device
// initialization is set to fail with ZE_RESULT_ERROR_UNKNOWN; the test expects that
// error to be the result of the reset call.
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseAndReInitFailsDuringResetWhenCallingResetThenErrorIsReturned) {
    initGlobalOps();

    // Use a pid from the mock's list so the device holder is not this process.
    pProcfsAccess->ourDevicePid = pProcfsAccess->pidList[0];
    pProcfsAccess->ourDeviceFd = pProcfsAccess->extraFd;

    // Swap in the mock sysman implementation, give it the fixture's device handle,
    // and make the wait expire immediately.
    auto *pOsGlobalOps = static_cast<PublicLinuxGlobalOperationsImp *>(pGlobalOperationsImp->pOsGlobalOperations);
    pOsGlobalOps->pLinuxSysmanImp = pMockGlobalOpsLinuxSysmanImp.get();
    pOsGlobalOps->pLinuxSysmanImp->pDevice = pLinuxSysmanImp->getDeviceHandle();
    pOsGlobalOps->resetTimeout = 0;

    pMockGlobalOpsLinuxSysmanImp->ourDevicePid = pProcfsAccess->ourDevicePid;
    // NOTE(review): the fd field receives ourDevicePid rather than ourDeviceFd; this
    // looks like a copy-paste slip. Confirm the intent before changing it.
    pMockGlobalOpsLinuxSysmanImp->ourDeviceFd = pProcfsAccess->ourDevicePid;
    pMockGlobalOpsLinuxSysmanImp->setMockInitDeviceError(ZE_RESULT_ERROR_UNKNOWN);

    // Queue the mocked process-list replies: DEVICE_UNUSED once, then DEVICE_IN_USE
    // flagged as repeated.
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_UNUSED);
    pProcfsAccess->mockListProcessCall.push_back(DEVICE_IN_USE);
    pProcfsAccess->isRepeated.push_back(false);
    pProcfsAccess->isRepeated.push_back(true);
    pProcfsAccess->mockNoKill = true;

    const ze_bool_t force = true;
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, zesDeviceReset(device, force));
}
904+
840905
TEST_F(SysmanGlobalOperationsIntegratedFixture, GivenDeviceNotInUseWhenCallingResetThenSuccessIsReturned) {
841906

842907
// Pretend we have the device open

0 commit comments

Comments
 (0)