@@ -20,130 +20,138 @@ package main
20
20
import (
21
21
"context"
22
22
"fmt"
23
- "sync"
24
23
25
24
"github.com/NVIDIA/go-nvml/pkg/nvml"
26
25
"k8s.io/klog/v2"
27
26
)
28
27
28
const (
	// InstanceID is the sentinel the health monitor compares an NVML event's
	// GpuInstanceId and ComputeInstanceId against: when either field equals
	// this value the event is not matched against a specific MIG GPU/compute
	// instance.
	// NOTE(review): presumably mirrors the 0xFFFFFFFF marker NVML stores in
	// these fields for events not generated by a MIG instance; confirm
	// against the NVML event data documentation.
	InstanceID uint32 = 0xFFFFFFFF
)
31
+
29
32
type deviceHealthMonitor struct {
30
- nvdevlib * deviceLib
31
- allocatable AllocatableDevices
32
- eventSet nvml.EventSet
33
- unhealthy chan * AllocatableDevice
34
- stop chan struct {}
35
- wg sync.WaitGroup
33
+ nvmllib nvml.Interface
34
+ eventSet nvml.EventSet
35
+ unhealthy chan * AllocatableDevice
36
+ stop chan struct {}
37
+ uuidToDeviceMap map [string ]* AllocatableDevice
36
38
}
37
39
38
40
func newDeviceHealthMonitor (ctx context.Context , config * Config , allocatable AllocatableDevices , nvdevlib * deviceLib ) (* deviceHealthMonitor , error ) {
39
- klog .Info ("[SWATI DEBUG] initializing NVML.." )
40
- if err := nvdevlib .Init (); err != nil {
41
- return nil , fmt .Errorf ("failed to initialize NVML: %w" , err )
41
+ if nvdevlib .nvmllib == nil {
42
+ return nil , fmt .Errorf ("nvml library is nil" )
43
+ }
44
+
45
+ m := & deviceHealthMonitor {
46
+ nvmllib : nvdevlib .nvmllib ,
47
+ unhealthy : make (chan * AllocatableDevice , len (allocatable )),
48
+ stop : make (chan struct {}),
42
49
}
43
- //defer nvdevlib.alwaysShutdown()
44
50
45
- //klog.Info("[SWATI DEBUG] getting all devices..")
46
- //allocatable, err := nvdevlib.enumerateAllPossibleDevices(config)
47
- //if err != nil {
48
- // return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
49
- //}
51
+ if r := m .nvmllib .Init (); r != nvml .SUCCESS {
52
+ return nil , fmt .Errorf ("failed to initialize NVML: %v" , r )
53
+ }
50
54
51
- klog .Info ("[SWATI DEBUG] creating NVML events" )
52
- eventSet , err := nvdevlib .nvmllib .EventSetCreate ()
55
+ klog .V ( 6 ). Info ("creating NVML events for device health monitor " )
56
+ eventSet , err := m .nvmllib .EventSetCreate ()
53
57
if err != nvml .SUCCESS {
58
+ _ = m .nvmllib .Shutdown ()
54
59
return nil , fmt .Errorf ("failed to create event set: %w" , err )
55
60
}
61
+ m .eventSet = eventSet
56
62
57
- monitor := & deviceHealthMonitor {
58
- nvdevlib : nvdevlib ,
59
- allocatable : allocatable ,
60
- eventSet : eventSet ,
61
- unhealthy : make (chan * AllocatableDevice , len (allocatable )),
62
- stop : make (chan struct {}),
63
- }
63
+ m .uuidToDeviceMap = getUuidToDeviceMap (allocatable )
64
64
65
- klog .Info ("[SWATI DEBUG] registering NVML events" )
66
- if err := monitor .registerDevicesForEvents (); err != nil {
67
- monitor .eventSet .Free ()
68
- return nil , fmt .Errorf ("failed to register devices for health monitoring: %w" , err )
69
- }
65
+ klog .V (6 ).Info ("registering NVML events for device health monitor" )
66
+ m .registerDevicesForEvents ()
70
67
71
- monitor .start ()
72
- return monitor , nil
68
+ klog .V (6 ).Info ("started device health monitoring" )
69
+ go m .run ()
70
+
71
+ return m , nil
73
72
}
74
73
75
- func (m * deviceHealthMonitor ) registerDevicesForEvents () error {
76
- nvmllib := m . nvdevlib . nvmllib
74
+ func (m * deviceHealthMonitor ) registerDevicesForEvents () {
75
+ // TODO: add a list of xids to ignore
77
76
eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
78
77
79
- for _ , uuid := range m .allocatable .UUIDs () {
80
- gpu , err := nvmllib .DeviceGetHandleByUUID (uuid )
78
+ for uuid , dev := range m .uuidToDeviceMap {
79
+ // if its a mig device, get its Parent UUID
80
+ if dev .Type () == MigDeviceType {
81
+ uuid = dev .Mig .parent .UUID
82
+ }
83
+ gpu , err := m .nvmllib .DeviceGetHandleByUUID (uuid )
81
84
if err != nvml .SUCCESS {
82
- klog .Infof ("Unable to get NVML handle for UUID %s: %v; skipping health check for this device" , uuid , err )
85
+ klog .Infof ("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy" , uuid , err )
86
+ m .unhealthy <- dev
83
87
continue
84
88
}
85
89
86
- if err := gpu .RegisterEvents (eventMask , m .eventSet ); err != nvml .SUCCESS {
87
- klog .Infof ("Failed to register events for device %s: %v; skipping health check for this device" , uuid , err )
90
+ supportedEvents , err := gpu .GetSupportedEventTypes ()
91
+ if err != nvml .SUCCESS {
92
+ klog .Infof ("unable to determine the supported events for %s: %v; marking it as unhealthy" , uuid , err )
93
+ m .unhealthy <- dev
94
+ continue
88
95
}
89
- }
90
- return nil
91
- }
92
96
93
- func (m * deviceHealthMonitor ) start () {
94
- klog .Info ("[SWATI DEBUG] starting health monitor" )
95
- m .wg .Add (1 )
96
- go m .run ()
97
+ err = gpu .RegisterEvents (eventMask & supportedEvents , m .eventSet )
98
+ if err == nvml .ERROR_NOT_SUPPORTED {
99
+ klog .Warningf ("Device %v is too old to support healthchecking." , uuid )
100
+ }
101
+ if err != nvml .SUCCESS {
102
+ klog .Infof ("unable to register events for %s: %v; marking it as unhealthy" , uuid , err )
103
+ m .unhealthy <- dev
104
+ }
105
+
106
+ }
97
107
}
98
108
99
109
func (m * deviceHealthMonitor ) Stop () {
100
110
if m == nil {
101
111
return
102
112
}
103
- klog .Info ("[SWATI DEBUG] stopping health monitor" )
113
+ klog .V (6 ).Info ("stopping health monitor" )
114
+
104
115
close (m .stop )
105
- m .wg .Wait ()
106
- close (m .unhealthy )
107
116
m .eventSet .Free ()
108
117
109
- if m . nvdevlib != nil {
110
- m . nvdevlib . alwaysShutdown ( )
118
+ if r := m . nvmllib . Shutdown (); r != nvml . SUCCESS {
119
+ klog . Warningf ( "failed to shutdown NVML: %v" , r )
111
120
}
121
+ close (m .unhealthy )
112
122
}
113
123
114
- func (m * deviceHealthMonitor ) run () {
115
- defer m .wg .Done ()
116
-
124
+ func getUuidToDeviceMap (allocatable AllocatableDevices ) map [string ]* AllocatableDevice {
117
125
uuidToDeviceMap := make (map [string ]* AllocatableDevice )
118
- for _ , device := range m . allocatable {
126
+ for _ , device := range allocatable {
119
127
uuid := device .GetUUID ()
120
128
if uuid != "" {
121
129
uuidToDeviceMap [uuid ] = device
122
130
}
123
131
}
132
+ return uuidToDeviceMap
133
+ }
124
134
125
- klog .Info ("Starting event-driven GPU health monitor..." )
126
-
135
+ func (m * deviceHealthMonitor ) run () {
127
136
for {
128
137
select {
129
138
case <- m .stop :
130
- klog .Info ("Stopping event-driven GPU health monitor..." )
139
+ klog .V ( 6 ). Info ("Stopping event-driven GPU health monitor..." )
131
140
return
132
141
default :
133
142
event , err := m .eventSet .Wait (5000 )
134
143
if err == nvml .ERROR_TIMEOUT {
135
- klog .Info ("[SWATI DEBUG] timedout" )
136
144
continue
137
145
}
138
146
if err != nvml .SUCCESS {
139
147
klog .Infof ("Error waiting for event: %v; Marking all devices as unhealthy" , err )
140
- for _ , dev := range m .allocatable {
148
+ for _ , dev := range m .uuidToDeviceMap {
141
149
m .unhealthy <- dev
142
150
}
143
151
continue
144
152
}
145
153
146
- // Process health events
154
+ klog . Infof ( "Processing event %+v" , event )
147
155
switch event .EventType {
148
156
case nvml .EventTypeXidCriticalError :
149
157
klog .Warningf ("Critical XID error detected on device: %+v" , event )
@@ -158,16 +166,25 @@ func (m *deviceHealthMonitor) run() {
158
166
eventUUID , err := event .Device .GetUUID ()
159
167
if err != nvml .SUCCESS {
160
168
klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , event , err )
161
- for _ , dev := range m .allocatable {
169
+ for _ , dev := range m .uuidToDeviceMap {
162
170
m .unhealthy <- dev
163
171
}
164
172
continue
165
173
}
166
174
167
- device , exists := uuidToDeviceMap [eventUUID ]
175
+ device , exists := m . uuidToDeviceMap [eventUUID ]
168
176
if ! exists {
177
+ klog .Infof ("Ignoring event for unexpected device: %v" , eventUUID )
169
178
continue
170
179
}
180
+ if device .Type () == MigDeviceType && event .GpuInstanceId != InstanceID && event .ComputeInstanceId != InstanceID {
181
+ giID := device .Mig .giInfo .Id
182
+ ciID := device .Mig .ciInfo .Id
183
+ if giID != event .GpuInstanceId || ciID != event .ComputeInstanceId {
184
+ continue
185
+ }
186
+ klog .Infof ("Event for mig device %v (giID=%v, ciID=%v)" , device .Mig .UUID , giID , ciID )
187
+ }
171
188
172
189
// Send notification to driver
173
190
klog .Infof ("Sending unhealthy notification for device %s due to event type %v" , eventUUID , event .EventType )
0 commit comments