Skip to content

Commit 717656d

Browse files
committed
simplify code
Signed-off-by: Swati Gupta <[email protected]>
1 parent 407562d commit 717656d

File tree

2 files changed

+86
-67
lines changed

2 files changed

+86
-67
lines changed

cmd/gpu-kubelet-plugin/device_health.go

Lines changed: 80 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -20,130 +20,138 @@ package main
2020
import (
2121
"context"
2222
"fmt"
23-
"sync"
2423

2524
"github.com/NVIDIA/go-nvml/pkg/nvml"
2625
"k8s.io/klog/v2"
2726
)
2827

28+
const (
29+
InstanceID uint32 = 0xFFFFFFFF
30+
)
31+
2932
type deviceHealthMonitor struct {
30-
nvdevlib *deviceLib
31-
allocatable AllocatableDevices
32-
eventSet nvml.EventSet
33-
unhealthy chan *AllocatableDevice
34-
stop chan struct{}
35-
wg sync.WaitGroup
33+
nvmllib nvml.Interface
34+
eventSet nvml.EventSet
35+
unhealthy chan *AllocatableDevice
36+
stop chan struct{}
37+
uuidToDeviceMap map[string]*AllocatableDevice
3638
}
3739

3840
func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*deviceHealthMonitor, error) {
39-
klog.Info("[SWATI DEBUG] initializing NVML..")
40-
if err := nvdevlib.Init(); err != nil {
41-
return nil, fmt.Errorf("failed to initialize NVML: %w", err)
41+
if nvdevlib.nvmllib == nil {
42+
return nil, fmt.Errorf("nvml library is nil")
43+
}
44+
45+
m := &deviceHealthMonitor{
46+
nvmllib: nvdevlib.nvmllib,
47+
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
48+
stop: make(chan struct{}),
4249
}
43-
//defer nvdevlib.alwaysShutdown()
4450

45-
//klog.Info("[SWATI DEBUG] getting all devices..")
46-
//allocatable, err := nvdevlib.enumerateAllPossibleDevices(config)
47-
//if err != nil {
48-
// return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
49-
//}
51+
if r := m.nvmllib.Init(); r != nvml.SUCCESS {
52+
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
53+
}
5054

51-
klog.Info("[SWATI DEBUG] creating NVML events")
52-
eventSet, err := nvdevlib.nvmllib.EventSetCreate()
55+
klog.V(6).Info("creating NVML events for device health monitor")
56+
eventSet, err := m.nvmllib.EventSetCreate()
5357
if err != nvml.SUCCESS {
58+
_ = m.nvmllib.Shutdown()
5459
return nil, fmt.Errorf("failed to create event set: %w", err)
5560
}
61+
m.eventSet = eventSet
5662

57-
monitor := &deviceHealthMonitor{
58-
nvdevlib: nvdevlib,
59-
allocatable: allocatable,
60-
eventSet: eventSet,
61-
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
62-
stop: make(chan struct{}),
63-
}
63+
m.uuidToDeviceMap = getUuidToDeviceMap(allocatable)
6464

65-
klog.Info("[SWATI DEBUG] registering NVML events")
66-
if err := monitor.registerDevicesForEvents(); err != nil {
67-
monitor.eventSet.Free()
68-
return nil, fmt.Errorf("failed to register devices for health monitoring: %w", err)
69-
}
65+
klog.V(6).Info("registering NVML events for device health monitor")
66+
m.registerDevicesForEvents()
7067

71-
monitor.start()
72-
return monitor, nil
68+
klog.V(6).Info("started device health monitoring")
69+
go m.run()
70+
71+
return m, nil
7372
}
7473

75-
func (m *deviceHealthMonitor) registerDevicesForEvents() error {
76-
nvmllib := m.nvdevlib.nvmllib
74+
func (m *deviceHealthMonitor) registerDevicesForEvents() {
75+
// TODO: add a list of xids to ignore
7776
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
7877

79-
for _, uuid := range m.allocatable.UUIDs() {
80-
gpu, err := nvmllib.DeviceGetHandleByUUID(uuid)
78+
for uuid, dev := range m.uuidToDeviceMap {
79+
// if its a mig device, get its Parent UUID
80+
if dev.Type() == MigDeviceType {
81+
uuid = dev.Mig.parent.UUID
82+
}
83+
gpu, err := m.nvmllib.DeviceGetHandleByUUID(uuid)
8184
if err != nvml.SUCCESS {
82-
klog.Infof("Unable to get NVML handle for UUID %s: %v; skipping health check for this device", uuid, err)
85+
klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", uuid, err)
86+
m.unhealthy <- dev
8387
continue
8488
}
8589

86-
if err := gpu.RegisterEvents(eventMask, m.eventSet); err != nvml.SUCCESS {
87-
klog.Infof("Failed to register events for device %s: %v; skipping health check for this device", uuid, err)
90+
supportedEvents, err := gpu.GetSupportedEventTypes()
91+
if err != nvml.SUCCESS {
92+
klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", uuid, err)
93+
m.unhealthy <- dev
94+
continue
8895
}
89-
}
90-
return nil
91-
}
9296

93-
func (m *deviceHealthMonitor) start() {
94-
klog.Info("[SWATI DEBUG] starting health monitor")
95-
m.wg.Add(1)
96-
go m.run()
97+
err = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
98+
if err == nvml.ERROR_NOT_SUPPORTED {
99+
klog.Warningf("Device %v is too old to support healthchecking.", uuid)
100+
}
101+
if err != nvml.SUCCESS {
102+
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", uuid, err)
103+
m.unhealthy <- dev
104+
}
105+
106+
}
97107
}
98108

99109
func (m *deviceHealthMonitor) Stop() {
100110
if m == nil {
101111
return
102112
}
103-
klog.Info("[SWATI DEBUG] stopping health monitor")
113+
klog.V(6).Info("stopping health monitor")
114+
104115
close(m.stop)
105-
m.wg.Wait()
106-
close(m.unhealthy)
107116
m.eventSet.Free()
108117

109-
if m.nvdevlib != nil {
110-
m.nvdevlib.alwaysShutdown()
118+
if r := m.nvmllib.Shutdown(); r != nvml.SUCCESS {
119+
klog.Warningf("failed to shutdown NVML: %v", r)
111120
}
121+
close(m.unhealthy)
112122
}
113123

114-
func (m *deviceHealthMonitor) run() {
115-
defer m.wg.Done()
116-
124+
func getUuidToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableDevice {
117125
uuidToDeviceMap := make(map[string]*AllocatableDevice)
118-
for _, device := range m.allocatable {
126+
for _, device := range allocatable {
119127
uuid := device.GetUUID()
120128
if uuid != "" {
121129
uuidToDeviceMap[uuid] = device
122130
}
123131
}
132+
return uuidToDeviceMap
133+
}
124134

125-
klog.Info("Starting event-driven GPU health monitor...")
126-
135+
func (m *deviceHealthMonitor) run() {
127136
for {
128137
select {
129138
case <-m.stop:
130-
klog.Info("Stopping event-driven GPU health monitor...")
139+
klog.V(6).Info("Stopping event-driven GPU health monitor...")
131140
return
132141
default:
133142
event, err := m.eventSet.Wait(5000)
134143
if err == nvml.ERROR_TIMEOUT {
135-
klog.Info("[SWATI DEBUG] timedout")
136144
continue
137145
}
138146
if err != nvml.SUCCESS {
139147
klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", err)
140-
for _, dev := range m.allocatable {
148+
for _, dev := range m.uuidToDeviceMap {
141149
m.unhealthy <- dev
142150
}
143151
continue
144152
}
145153

146-
// Process health events
154+
klog.Infof("Processing event %+v", event)
147155
switch event.EventType {
148156
case nvml.EventTypeXidCriticalError:
149157
klog.Warningf("Critical XID error detected on device: %+v", event)
@@ -158,16 +166,25 @@ func (m *deviceHealthMonitor) run() {
158166
eventUUID, err := event.Device.GetUUID()
159167
if err != nvml.SUCCESS {
160168
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, err)
161-
for _, dev := range m.allocatable {
169+
for _, dev := range m.uuidToDeviceMap {
162170
m.unhealthy <- dev
163171
}
164172
continue
165173
}
166174

167-
device, exists := uuidToDeviceMap[eventUUID]
175+
device, exists := m.uuidToDeviceMap[eventUUID]
168176
if !exists {
177+
klog.Infof("Ignoring event for unexpected device: %v", eventUUID)
169178
continue
170179
}
180+
if device.Type() == MigDeviceType && event.GpuInstanceId != InstanceID && event.ComputeInstanceId != InstanceID {
181+
giID := device.Mig.giInfo.Id
182+
ciID := device.Mig.ciInfo.Id
183+
if giID != event.GpuInstanceId || ciID != event.ComputeInstanceId {
184+
continue
185+
}
186+
klog.Infof("Event for mig device %v (giID=%v, ciID=%v)", device.Mig.UUID, giID, ciID)
187+
}
171188

172189
// Send notification to driver
173190
klog.Infof("Sending unhealthy notification for device %s due to event type %v", eventUUID, event.EventType)

cmd/gpu-kubelet-plugin/driver.go

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
105105
return nil, err
106106
}
107107

108-
go driver.handleHealthNotifications(ctx, config.flags.nodeName)
108+
go driver.deviceHealthEvents(ctx, config.flags.nodeName)
109109

110110
return driver, nil
111111
}
@@ -191,12 +191,12 @@ func (d *driver) nodeUnprepareResource(ctx context.Context, claimNs kubeletplugi
191191
return nil
192192
}
193193

194-
func (d *driver) handleHealthNotifications(ctx context.Context, nodeName string) {
195-
klog.Info("[SWATI DEBUG] handling Health Notifications")
194+
func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
195+
klog.Info("Processing device health notifications")
196196
for {
197197
select {
198198
case <-ctx.Done():
199-
klog.Info("Stopping health notification handler")
199+
klog.Info("Stop processing device health notifications")
200200
return
201201
case device, ok := <-d.deviceHealthMonitor.Unhealthy():
202202
if !ok {
@@ -222,6 +222,8 @@ func (d *driver) handleHealthNotifications(ctx context.Context, nodeName string)
222222
}
223223

224224
// Republish updated resources
225+
// TODO: 1. remove this.
226+
// 2. Add device taints
225227
klog.Info("[SWATI DEBUG] rebulishing resourceslice with healthy devices")
226228
resources := resourceslice.DriverResources{
227229
Pools: map[string]resourceslice.Pool{

0 commit comments

Comments
 (0)