Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion cmd/compute-domain-controller/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type CleanupManager[T metav1.Object] struct {
}

func NewCleanupManager[T metav1.Object](informer cache.SharedIndexInformer, getComputeDomain GetComputeDomainFunc, callback CleanupCallback[T]) *CleanupManager[T] {
klog.Infof("Creating new Cleanup Manager for %T", *new(T))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test:

I0331 20:49:32.416970       1 cleanup.go:29] Creating new Cleanup Manager for *v1beta1.ResourceClaimTemplate
I0331 20:49:32.416975       1 cleanup.go:29] Creating new Cleanup Manager for *v1.DaemonSet

return &CleanupManager[T]{
informer: informer,
getComputeDomain: getComputeDomain,
Expand Down Expand Up @@ -61,19 +62,25 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) {
case <-ticker.C:
klog.V(6).Infof("Running periodic sync to remove %T objects owned by stale ComputeDomain", *new(T))
store := m.informer.GetStore()
for _, item := range store.List() {
items := store.List()
klog.V(6).Infof("Found %d items to check for cleanup", len(items))

for _, item := range items {
obj, ok := item.(T)
if !ok {
klog.V(6).Infof("Expected object %T but got %T, skipping..", *new(T), obj)
continue
}

labels := obj.GetLabels()
if labels == nil {
klog.V(6).Infof("Object %T has no labels, skipping..", *new(T))
continue
}

uid, exists := labels[computeDomainLabelKey]
if !exists {
klog.V(6).Infof("Object %T does not have ComputeDomain label, skipping..", *new(T))
continue
}

Expand All @@ -84,6 +91,7 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) {
}

if computeDomain != nil {
klog.V(6).Infof("ComputeDomain with UID %s still exists, skipping cleanup", uid)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found 1 items to check for cleanup
I0401 20:39:32.718643       1 cleanup.go:94] ComputeDomain with UID 2577168b-75d1-4c51-a659-d2225e2fe24f still exists, skipping cleanup

continue
}

Expand Down
26 changes: 23 additions & 3 deletions cmd/compute-domain-controller/computedomain.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"sync"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -64,11 +65,13 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
factory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, informerResyncPeriod)
informer := factory.Resource().V1beta1().ComputeDomains().Informer()

klog.Infof("Creating new ComputeDomainManager for %s/%s", config.driverName, config.driverNamespace)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test:

I0331 20:49:32.416920       1 computedomain.go:68] Creating new ComputeDomainManager for compute-domain.nvidia.com/nvidia-dra-driver-gpu

m := &ComputeDomainManager{
config: config,
factory: factory,
informer: informer,
}
// TODO (swati) add logs for daemonset and resourceClaimTemplate managers in verbose mode
m.daemonSetManager = NewDaemonSetManager(config, m.Get)
m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config, m.Get)

Expand Down Expand Up @@ -147,6 +150,7 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) {
return nil, fmt.Errorf("error retrieving ComputeDomain by UID: %w", err)
}
if len(cds) == 0 {
klog.V(2).Infof("No ComputeDomain found with UID: %s", uid)
return nil, nil
}
if len(cds) != 1 {
Expand All @@ -166,11 +170,12 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
return fmt.Errorf("error retrieving ComputeDomain: %w", err)
}
if cd == nil {
klog.V(2).Infof("ComputeDomain with UID %s not found, nothing to do", uid)
return nil
}

if cd.GetDeletionTimestamp() == nil {
return fmt.Errorf("attempting to remove finalizer before ComputeDomain marked for deletion")
return fmt.Errorf("attempting to remove finalizer before ComputeDomain %s/%s with UID %s marked for deletion", cd.Namespace, cd.Name, uid)
}

newCD := cd.DeepCopy()
Expand All @@ -181,6 +186,7 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
}
}
if len(cd.Finalizers) == len(newCD.Finalizers) {
klog.V(2).Infof("Finalizer not found on ComputeDomain %s/%s, nothing to do", cd.Namespace, cd.Name)
return nil
}

Expand All @@ -191,6 +197,20 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
return nil
}

// logNodesWithComputeDomainLabel logs the nodes that carry the label of the
// ComputeDomain identified by cdUID and returns their names.
//
// nodes is the list of nodes to report on; a nil or empty list is logged as
// "no nodes found" and yields a nil slice. Otherwise the returned names are in
// the same order as nodes.Items.
func (m *ComputeDomainManager) logNodesWithComputeDomainLabel(nodes *corev1.NodeList, cdUID string) []string {
	// Guard against a nil list so a caller passing the result of a failed or
	// skipped List call does not cause a nil-pointer dereference here.
	if nodes == nil || len(nodes.Items) == 0 {
		klog.Infof("No nodes found with label for ComputeDomain with UID %s", cdUID)
		return nil
	}

	// The final length is known, so size the slice once. Index into Items
	// instead of ranging by value to avoid copying each Node struct just to
	// read its name.
	nodeNames := make([]string, 0, len(nodes.Items))
	for i := range nodes.Items {
		nodeNames = append(nodeNames, nodes.Items[i].Name)
	}

	// Emit the names at verbose level: this is the logging the function name
	// promises, kept out of the default log stream because callers may also
	// surface the same names in a returned error.
	klog.V(2).Infof("Found %d nodes with label for ComputeDomain with UID %s: %v", len(nodeNames), cdUID, nodeNames)
	return nodeNames
}

// AssertWorkloadsCompleted ensures that all workloads associated with a ComputeDomain have completed.
//
// TODO: We should probably also check to ensure that all ResourceClaims
Expand All @@ -215,9 +235,9 @@ func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdU
}

if len(nodes.Items) != 0 {
return fmt.Errorf("nodes exist with label for ComputeDomain %s", cdUID)
nodeNames := m.logNodesWithComputeDomainLabel(nodes, cdUID)
return fmt.Errorf("nodes %v with label for ComputeDomain %s", nodeNames, cdUID)
}

return nil
}

Expand Down
5 changes: 5 additions & 0 deletions cmd/compute-domain-controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"context"
"fmt"

"k8s.io/klog/v2"

"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/workqueue"
)
Expand Down Expand Up @@ -57,6 +59,8 @@ func NewController(config *Config) *Controller {
// It initializes the work queue, starts the ComputeDomain manager, and handles
// graceful shutdown when the context is cancelled.
func (c *Controller) Run(ctx context.Context) error {
klog.Info("Starting ComputeDomain Controller")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test:

I0331 20:49:32.416860       1 controller.go:62] Starting ComputeDomain Controller

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test:

I0331 20:49:32.416860       1 controller.go:62] Starting ComputeDomain Controller


workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())

managerConfig := &ManagerConfig{
Expand All @@ -78,5 +82,6 @@ func (c *Controller) Run(ctx context.Context) error {
return fmt.Errorf("error stopping ComputeDomain manager: %w", err)
}

klog.Info("ComputeDomain Controller is shutdown")
return nil
}
13 changes: 11 additions & 2 deletions cmd/compute-domain-controller/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomai

informer := factory.Apps().V1().DaemonSets().Informer()

klog.Infof("Creating new DaemonSetManager for driver %s/%s", config.driverNamespace, config.driverName)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test log:

I0331 20:49:32.416950       1 daemonset.go:89] Creating new DaemonSetManager for driver nvidia-dra-driver-gpu/compute-domain.nvidia.com

m := &DaemonSetManager{
config: config,
getComputeDomain: getComputeDomain,
Expand Down Expand Up @@ -162,7 +163,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
return nil, fmt.Errorf("error retrieving DaemonSet: %w", err)
}
if len(ds) > 1 {
return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID %s", cd.UID)
}
if len(ds) == 1 {
return ds[0], nil
Expand Down Expand Up @@ -209,6 +210,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
return nil, fmt.Errorf("error creating DaemonSet: %w", err)
}

klog.V(2).Infof("Successfully created DaemonSet %s/%s for ComputeDomain %s/%s", d.Namespace, d.Name, cd.Namespace, cd.Name)
return d, nil
}

Expand All @@ -218,9 +220,10 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
return fmt.Errorf("error retrieving DaemonSet: %w", err)
}
if len(ds) > 1 {
return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID %s", cdUID)
}
if len(ds) == 0 {
klog.V(2).Infof("No DaemonSet found for ComputeDomain UID %s, nothing to delete", cdUID)
return nil
}

Expand All @@ -231,6 +234,7 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
}

if d.GetDeletionTimestamp() != nil {
klog.V(2).Infof("DaemonSet %s/%s is already marked for deletion", d.Namespace, d.Name)
return nil
}

Expand All @@ -239,6 +243,7 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
return fmt.Errorf("erroring deleting DaemonSet: %w", err)
}

klog.V(2).Infof("Successfully deleted DaemonSet %s/%s for ComputeDomain UID %s", d.Namespace, d.Name, cdUID)
return nil
}

Expand Down Expand Up @@ -271,6 +276,7 @@ func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) er
return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
}
if len(ds) == 0 {
klog.V(2).Infof("No DaemonSet found for ComputeDomain UID %s, nothing to remove finalizer from", cdUID)
return nil
}

Expand All @@ -288,6 +294,7 @@ func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) er
}
}
if len(d.Finalizers) == len(newD.Finalizers) {
klog.V(2).Infof("Finalizer %s not found on DaemonSet %s/%s", computeDomainFinalizer, d.Namespace, d.Name)
return nil
}

Expand Down Expand Up @@ -322,10 +329,12 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
return fmt.Errorf("error getting ComputeDomain: %w", err)
}
if cd == nil {
klog.V(2).Info("No ComputeDomain found, skipping processing")
return nil
}

if int(d.Status.NumberReady) != cd.Spec.NumNodes {
klog.V(2).Infof("DaemonSet %s/%s has %d ready nodes, expecting %d, waiting for all nodes to be ready", d.Namespace, d.Name, d.Status.NumberReady, cd.Spec.NumNodes)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test log:

I0331 20:49:32.819099       1 daemonset.go:337] DaemonSet nvidia-dra-driver-gpu/nvbandwidth-test-compute-domain-trgmx has 0 ready nodes, expecting 4, waiting for all nodes to be ready

return nil
}

Expand Down
4 changes: 4 additions & 0 deletions cmd/compute-domain-controller/indexers.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"
)

func uidIndexer[T metav1.ObjectMetaAccessor](obj any) ([]string, error) {
Expand All @@ -43,6 +44,7 @@ func addComputeDomainLabelIndexer[T metav1.ObjectMetaAccessor](informer cache.Sh
if value, exists := labels[computeDomainLabelKey]; exists {
return []string{value}, nil
}
klog.V(2).Info("No object found with ComputeDomain Label")
return nil, nil
},
})
Expand All @@ -58,6 +60,7 @@ func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.S
return nil, fmt.Errorf("error getting %T via ComputeDomain label: %w", *new(T1), err)
}
if len(objs) == 0 {
klog.V(2).Infof("No object found with ComputeDomain Label with UID %s", cdUID)
return nil, nil
}

Expand All @@ -70,5 +73,6 @@ func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.S
ds = append(ds, d)
}

klog.V(2).Infof("Found %d objects with ComputeDomain Label with UID %s", len(ds), cdUID)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resulting log:

I0331 20:49:32.819074       1 indexers.go:76] Found 1 objects with ComputeDomain Label with UID 2577168b-75d1-4c51-a659-d2225e2fe24f
I0331 20:49:32.819083       1 indexers.go:76] Found 1 objects with ComputeDomain Label with UID 2577168b-75d1-4c51-a659-d2225e2fe24f

return ds, nil
}
10 changes: 10 additions & 0 deletions cmd/compute-domain-controller/resourceclaimtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func newBaseResourceClaimTemplateManager(config *ManagerConfig, getComputeDomain

informer := factory.Resource().V1beta1().ResourceClaimTemplates().Informer()

klog.Infof("Creating new ResourceClaimTemplateManager for driver %s/%s", config.driverNamespace, config.driverName)
m := &BaseResourceClaimTemplateManager{
config: config,
getComputeDomain: getComputeDomain,
Expand Down Expand Up @@ -167,6 +168,7 @@ func (m *BaseResourceClaimTemplateManager) Create(ctx context.Context, templateP
return nil, fmt.Errorf("error creating ResourceClaimTemplate: %w", err)
}

klog.V(2).Infof("Successfully created ResourceClaimTemplate %s/%s for ComputeDomain %s/%s", rct.Namespace, rct.Name, m.config.driverNamespace, m.config.driverName)
return rct, nil
}

Expand All @@ -179,12 +181,14 @@ func (m *BaseResourceClaimTemplateManager) Delete(ctx context.Context, cdUID str
return fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
}
if len(rcts) == 0 {
klog.V(2).Infof("No ResourceClaimTemplate found for ComputeDomain UID %s, nothing to delete", cdUID)
return nil
}

rct := rcts[0]

if rct.GetDeletionTimestamp() != nil {
klog.V(2).Infof("ResourceClaimTemplate %s/%s is already marked for deletion", rct.Namespace, rct.Name)
return nil
}

Expand All @@ -193,6 +197,7 @@ func (m *BaseResourceClaimTemplateManager) Delete(ctx context.Context, cdUID str
return fmt.Errorf("erroring deleting ResourceClaimTemplate: %w", err)
}

klog.V(2).Infof("Successfully deleted ResourceClaimTemplate %s/%s for ComputeDomain UID %s", rct.Namespace, rct.Name, cdUID)
return nil
}

Expand All @@ -205,6 +210,7 @@ func (m *BaseResourceClaimTemplateManager) RemoveFinalizer(ctx context.Context,
return fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
}
if len(rcts) == 0 {
klog.V(2).Infof("No ResourceClaimTemplate found for ComputeDomain UID %s, nothing to remove finalizer from", cdUID)
return nil
}

Expand All @@ -222,6 +228,7 @@ func (m *BaseResourceClaimTemplateManager) RemoveFinalizer(ctx context.Context,
}
}
if len(rct.Finalizers) == len(newRCT.Finalizers) {
klog.V(2).Infof("Finalizer %s not found on DaemonSet %s/%s", computeDomainFinalizer, rct.Namespace, rct.Name)
return nil
}

Expand Down Expand Up @@ -286,6 +293,7 @@ func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, name
return nil, fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
}
if len(rcts) == 1 {
klog.V(2).Infof("Found ResourceClaimTemplate %s/%s for ComputeDomain UID %s", rcts[0].Namespace, rcts[0].Name, cd.UID)
return rcts[0], nil
}

Expand All @@ -310,6 +318,7 @@ func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, name
return nil, fmt.Errorf("error creating ResourceClaimTemplate from base: %w", err)
}

klog.V(2).Infof("Successfully created ResourceClaimTemplate from base %s/%s for ComputeDomain %s/%s", rct.Namespace, rct.Name, cd.Namespace, cd.Name)
return rct, nil
}

Expand Down Expand Up @@ -346,6 +355,7 @@ func (m *WorkloadResourceClaimTemplateManager) Create(ctx context.Context, names
return nil, fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
}
if len(rcts) == 1 {
klog.V(2).Infof("Found ResourceClaimTemplate %s/%s for ComputeDomain UID %s", rcts[0].Namespace, rcts[0].Name, cd.UID)
return rcts[0], nil
}

Expand Down