-
Notifications
You must be signed in to change notification settings - Fork 89
Draft: Add more logs to computedomain manager #285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
cc65465
7fe7ff2
5321f9c
6c543dc
1b23a9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ type CleanupManager[T metav1.Object] struct { | |
} | ||
|
||
func NewCleanupManager[T metav1.Object](informer cache.SharedIndexInformer, getComputeDomain GetComputeDomainFunc, callback CleanupCallback[T]) *CleanupManager[T] { | ||
klog.Infof("Creating new Cleanup Manager for %T", *new(T)) | ||
return &CleanupManager[T]{ | ||
informer: informer, | ||
getComputeDomain: getComputeDomain, | ||
|
@@ -61,19 +62,25 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) { | |
case <-ticker.C: | ||
klog.V(6).Infof("Running periodic sync to remove %T objects owned by stale ComputeDomain", *new(T)) | ||
store := m.informer.GetStore() | ||
for _, item := range store.List() { | ||
items := store.List() | ||
klog.V(6).Infof("Found %d items to check for cleanup", len(items)) | ||
|
||
for _, item := range items { | ||
obj, ok := item.(T) | ||
if !ok { | ||
klog.V(6).Infof("Expected object %T but got %T, skipping..", *new(T), obj) | ||
continue | ||
} | ||
|
||
labels := obj.GetLabels() | ||
if labels == nil { | ||
klog.V(6).Infof("Object %T has no labels, skipping..", *new(T)) | ||
continue | ||
} | ||
|
||
uid, exists := labels[computeDomainLabelKey] | ||
if !exists { | ||
klog.V(6).Infof("Object %T does not have ComputeDomain label, skipping..", *new(T)) | ||
continue | ||
} | ||
|
||
|
@@ -84,6 +91,7 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) { | |
} | ||
|
||
if computeDomain != nil { | ||
klog.V(6).Infof("ComputeDomain with UID %s still exists, skipping cleanup", uid) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
continue | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import ( | |
"sync" | ||
"time" | ||
|
||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/client-go/tools/cache" | ||
"k8s.io/klog/v2" | ||
|
@@ -64,11 +65,13 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager { | |
factory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, informerResyncPeriod) | ||
informer := factory.Resource().V1beta1().ComputeDomains().Informer() | ||
|
||
klog.Infof("Creating new ComputeDomainManager for %s/%s", config.driverName, config.driverNamespace) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test:
|
||
m := &ComputeDomainManager{ | ||
config: config, | ||
factory: factory, | ||
informer: informer, | ||
} | ||
// TODO (swati) add logs for daemonset and resourceClaimTemplate managers in verbose mode | ||
m.daemonSetManager = NewDaemonSetManager(config, m.Get) | ||
m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config, m.Get) | ||
|
||
|
@@ -147,6 +150,7 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) { | |
return nil, fmt.Errorf("error retrieving ComputeDomain by UID: %w", err) | ||
} | ||
if len(cds) == 0 { | ||
klog.V(2).Infof("No ComputeDomain found with UID: %s", uid) | ||
return nil, nil | ||
} | ||
if len(cds) != 1 { | ||
|
@@ -166,11 +170,12 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string) | |
return fmt.Errorf("error retrieving ComputeDomain: %w", err) | ||
} | ||
if cd == nil { | ||
klog.V(2).Infof("ComputeDomain with UID %s not found, nothing to do", uid) | ||
return nil | ||
} | ||
|
||
if cd.GetDeletionTimestamp() == nil { | ||
return fmt.Errorf("attempting to remove finalizer before ComputeDomain marked for deletion") | ||
return fmt.Errorf("attempting to remove finalizer before ComputeDomain %s/%s with UID %s marked for deletion", cd.Namespace, cd.Name, uid) | ||
} | ||
|
||
newCD := cd.DeepCopy() | ||
|
@@ -181,6 +186,7 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string) | |
} | ||
} | ||
if len(cd.Finalizers) == len(newCD.Finalizers) { | ||
klog.V(2).Infof("Finalizer not found on ComputeDomain %s/%s, nothing to do", cd.Namespace, cd.Name) | ||
return nil | ||
} | ||
|
||
|
@@ -191,6 +197,20 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string) | |
return nil | ||
} | ||
|
||
// logNodesWithComputeDomainLabel logs nodes that have a ComputeDomain label and returns their names. | ||
func (m *ComputeDomainManager) logNodesWithComputeDomainLabel(nodes *corev1.NodeList, cdUID string) []string { | ||
if len(nodes.Items) == 0 { | ||
klog.Infof("No nodes found with label for ComputeDomain with UID %s", cdUID) | ||
return nil | ||
} | ||
|
||
nodeNames := []string{} | ||
for _, node := range nodes.Items { | ||
nodeNames = append(nodeNames, node.Name) | ||
} | ||
return nodeNames | ||
} | ||
|
||
// AssertWorkloadsCompletes ensures that all workloads asssociated with a ComputeDomain have completed. | ||
// | ||
// TODO: We should probably also check to ensure that all ResourceClaims | ||
|
@@ -215,9 +235,9 @@ func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdU | |
} | ||
|
||
if len(nodes.Items) != 0 { | ||
return fmt.Errorf("nodes exist with label for ComputeDomain %s", cdUID) | ||
nodeNames := m.logNodesWithComputeDomainLabel(nodes, cdUID) | ||
return fmt.Errorf("nodes %v with label for ComputeDomain %s", nodeNames, cdUID) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,8 @@ import ( | |
"context" | ||
"fmt" | ||
|
||
"k8s.io/klog/v2" | ||
|
||
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags" | ||
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/workqueue" | ||
) | ||
|
@@ -57,6 +59,8 @@ func NewController(config *Config) *Controller { | |
// It initializes the work queue, starts the ComputeDomain manager, and handles | ||
// graceful shutdown when the context is cancelled. | ||
func (c *Controller) Run(ctx context.Context) error { | ||
klog.Info("Starting ComputeDomain Controller") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test:
|
||
|
||
workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter()) | ||
|
||
managerConfig := &ManagerConfig{ | ||
|
@@ -78,5 +82,6 @@ func (c *Controller) Run(ctx context.Context) error { | |
return fmt.Errorf("error stopping ComputeDomain manager: %w", err) | ||
} | ||
|
||
klog.Info("ComputeDomain Controller is shutdown") | ||
return nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,7 @@ func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomai | |
|
||
informer := factory.Apps().V1().DaemonSets().Informer() | ||
|
||
klog.Infof("Creating new DaemonSetManager for driver %s/%s", config.driverNamespace, config.driverName) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. test log:
|
||
m := &DaemonSetManager{ | ||
config: config, | ||
getComputeDomain: getComputeDomain, | ||
|
@@ -162,7 +163,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva | |
return nil, fmt.Errorf("error retrieving DaemonSet: %w", err) | ||
} | ||
if len(ds) > 1 { | ||
return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID") | ||
return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID %s", cd.UID) | ||
} | ||
if len(ds) == 1 { | ||
return ds[0], nil | ||
|
@@ -209,6 +210,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva | |
return nil, fmt.Errorf("error creating DaemonSet: %w", err) | ||
} | ||
|
||
klog.V(2).Infof("Successfully created DaemonSet %s/%s for ComputeDomain %s/%s", d.Namespace, d.Name, cd.Namespace, cd.Name) | ||
return d, nil | ||
} | ||
|
||
|
@@ -218,9 +220,10 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error { | |
return fmt.Errorf("error retrieving DaemonSet: %w", err) | ||
} | ||
if len(ds) > 1 { | ||
return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID") | ||
return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID %s", cdUID) | ||
} | ||
if len(ds) == 0 { | ||
klog.V(2).Infof("No DaemonSet found for ComputeDomain UID %s, nothing to delete", cdUID) | ||
return nil | ||
} | ||
|
||
|
@@ -231,6 +234,7 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error { | |
} | ||
|
||
if d.GetDeletionTimestamp() != nil { | ||
klog.V(2).Infof("DaemonSet %s/%s is already marked for deletion", d.Namespace, d.Name) | ||
return nil | ||
} | ||
|
||
|
@@ -239,6 +243,7 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error { | |
return fmt.Errorf("erroring deleting DaemonSet: %w", err) | ||
} | ||
|
||
klog.V(2).Infof("Successfully deleted DaemonSet %s/%s for ComputeDomain UID %s", d.Namespace, d.Name, cdUID) | ||
return nil | ||
} | ||
|
||
|
@@ -271,6 +276,7 @@ func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) er | |
return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID") | ||
} | ||
if len(ds) == 0 { | ||
klog.V(2).Infof("No DaemonSet found for ComputeDomain UID %s, nothing to remove finalizer from", cdUID) | ||
return nil | ||
} | ||
|
||
|
@@ -288,6 +294,7 @@ func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) er | |
} | ||
} | ||
if len(d.Finalizers) == len(newD.Finalizers) { | ||
klog.V(2).Infof("Finalizer %s not found on DaemonSet %s/%s", computeDomainFinalizer, d.Namespace, d.Name) | ||
return nil | ||
} | ||
|
||
|
@@ -322,10 +329,12 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error { | |
return fmt.Errorf("error getting ComputeDomain: %w", err) | ||
} | ||
if cd == nil { | ||
klog.V(2).Info("No ComputeDomain found, skipping processing") | ||
return nil | ||
} | ||
|
||
if int(d.Status.NumberReady) != cd.Spec.NumNodes { | ||
klog.V(2).Infof("DaemonSet %s/%s has %d ready nodes, expecting %d, waiting for all nodes to be ready", d.Namespace, d.Name, d.Status.NumberReady, cd.Spec.NumNodes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test log:
|
||
return nil | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import ( | |
|
||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/client-go/tools/cache" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
func uidIndexer[T metav1.ObjectMetaAccessor](obj any) ([]string, error) { | ||
|
@@ -43,6 +44,7 @@ func addComputeDomainLabelIndexer[T metav1.ObjectMetaAccessor](informer cache.Sh | |
if value, exists := labels[computeDomainLabelKey]; exists { | ||
return []string{value}, nil | ||
} | ||
klog.V(2).Info("No object found with ComputeDomain Label") | ||
return nil, nil | ||
}, | ||
}) | ||
|
@@ -58,6 +60,7 @@ func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.S | |
return nil, fmt.Errorf("error getting %T via ComputeDomain label: %w", *new(T1), err) | ||
} | ||
if len(objs) == 0 { | ||
klog.V(2).Infof("No object found with ComputeDomain Label with UID %s", cdUID) | ||
return nil, nil | ||
} | ||
|
||
|
@@ -70,5 +73,6 @@ func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.S | |
ds = append(ds, d) | ||
} | ||
|
||
klog.V(2).Infof("Found %d objects with ComputeDomain Label with UID %s", len(ds), cdUID) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Resulting log:
|
||
return ds, nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test: