Skip to content

Commit f80aa9a

Browse files
committed
featuregate healthcheck
Signed-off-by: Swati Gupta <[email protected]>
1 parent 0e6516d commit f80aa9a

File tree

6 files changed

+30
-10
lines changed

6 files changed

+30
-10
lines changed

cmd/gpu-kubelet-plugin/device_health.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable All
7474
}
7575

7676
func (m *deviceHealthMonitor) registerDevicesForEvents() {
77-
// TODO: add a list of xids to ignore
77+
// TODO(swati): 1. add a list of xids to ignore
78+
// 2. Skip ECC errors
7879
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
7980

8081
processedUUIDs := make(map[string]bool)

cmd/gpu-kubelet-plugin/device_state.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,7 @@ func GetOpaqueDeviceConfigs(
610610
}
611611

612612
func (s *DeviceState) MarkDeviceUnhealthy(device *AllocatableDevice) {
613+
// TODO(swati): check that a MIG device is marked unhealthy properly
613614
s.Lock()
614615
defer s.Unlock()
615616

cmd/gpu-kubelet-plugin/driver.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
"k8s.io/dynamic-resource-allocation/resourceslice"
3535
"k8s.io/klog/v2"
3636

37+
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/featuregates"
3738
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flock"
3839
)
3940

@@ -98,19 +99,21 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
9899
}
99100
driver.healthcheck = healthcheck
100101

101-
deviceHealthMonitor, err := newDeviceHealthMonitor(ctx, config, state.allocatable, state.nvdevlib)
102-
if err != nil {
103-
return nil, fmt.Errorf("start deviceHealthMonitor: %w", err)
104-
}
102+
if featuregates.Enabled(featuregates.DeviceHealthCheck) {
103+
104+
deviceHealthMonitor, err := newDeviceHealthMonitor(ctx, config, state.allocatable, state.nvdevlib)
105+
if err != nil {
106+
return nil, fmt.Errorf("start deviceHealthMonitor: %w", err)
107+
}
105108

106-
driver.deviceHealthMonitor = deviceHealthMonitor
109+
driver.deviceHealthMonitor = deviceHealthMonitor
110+
go driver.deviceHealthEvents(ctx, config.flags.nodeName)
111+
}
107112

108113
if err := driver.pluginhelper.PublishResources(ctx, resources); err != nil {
109114
return nil, err
110115
}
111116

112-
go driver.deviceHealthEvents(ctx, config.flags.nodeName)
113-
114117
return driver, nil
115118
}
116119

deployments/helm/nvidia-dra-driver-gpu/templates/rbac.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ rules:
2222
verbs: ["get", "list", "watch", "create", "update", "delete"]
2323
- apiGroups: ["resource.k8s.io"]
2424
resources: ["resourceclaims/status"]
25-
verbs: ["update"]
25+
verbs:
26+
- "update"
27+
{{- if .Values.featureGates.DeviceHealthCheck }}
28+
- "patch"
29+
{{- end }}
2630
- apiGroups: [""]
2731
resources: ["nodes"]
2832
verbs: ["get", "list", "watch", "update"]

deployments/helm/nvidia-dra-driver-gpu/values.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ resources:
6767
# ContextualLogging: true # Kubernetes logging feature (enabled by default)
6868
# LoggingAlphaOptions: false # Kubernetes logging alpha features
6969
# LoggingBetaOptions: true # Kubernetes logging beta features
70-
featureGates: {}
70+
featureGates:
71+
DeviceHealthCheck: false
7172

7273
# Webhook configuration
7374
webhook:

pkg/featuregates/featuregates.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ const (
3333

3434
// MPSSupport allows MPS (Multi-Process Service) settings to be specified.
3535
MPSSupport featuregate.Feature = "MPSSupport"
36+
37+
// DeviceHealthCheck enables device health checking.
38+
DeviceHealthCheck featuregate.Feature = "DeviceHealthCheck"
3639
)
3740

3841
// FeatureGates is a singleton representing the set of all feature gates and their values.
@@ -56,6 +59,13 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.VersionedSpecs{
5659
Version: version.MajorMinor(25, 8),
5760
},
5861
},
62+
DeviceHealthCheck: {
63+
{
64+
Default: false,
65+
PreRelease: featuregate.Alpha,
66+
Version: version.MajorMinor(25, 8),
67+
},
68+
},
5969
}
6070

6171
// init instantiates and sets the singleton 'FeatureGates' variable with newFeatureGates().

0 commit comments

Comments
 (0)