221 changes: 221 additions & 0 deletions test/e2e/vsphere/hostzonal.go
@@ -25,6 +25,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
configclient "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
machinesetclient "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1beta1"

v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -77,6 +78,18 @@ var _ = Describe("[sig-cluster-lifecycle][OCPFeatureGate:VSphereHostVMGroupZonal
failIfMachineIsNotInCorrectRegionZone(ctx, nodes, infra.Spec.PlatformSpec.VSphere, vsphereCreds)
})

It("should enforce vm-host affinity rules between VM groups and host groups [apigroup:machine.openshift.io][Suite:openshift/conformance/parallel]", func() {
failIfVMHostAffinityRulesAreNotEnforced(ctx, nodes, infra.Spec.PlatformSpec.VSphere, vsphereCreds)
})

It("should respect zonal constraints during machine provisioning and scaling operations [apigroup:machine.openshift.io][Suite:openshift/conformance/parallel]", func() {
failIfMachineAPIViolatesZonalConstraints(ctx, infra.Spec.PlatformSpec.VSphere, vsphereCreds)
})

It("should handle zone failures gracefully and recover workloads to healthy zones [apigroup:machine.openshift.io][Suite:openshift/conformance/parallel]", func() {
failIfZoneFailureRecoveryIsNotGraceful(ctx, nodes, infra.Spec.PlatformSpec.VSphere, vsphereCreds)
})

})

func getClusterVmGroups(ctx context.Context, vim25Client *vim25.Client, computeCluster string) ([]*types.ClusterVmGroup, error) {
@@ -300,6 +313,214 @@ func failIfMachineIsNotInCorrectVMGroup(ctx context.Context,
}
}

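// failIfVMHostAffinityRulesAreNotEnforced checks that each failure domain's named
// VM-Host affinity rule exists in the cluster's DRS rule list, is enabled, and binds
// the expected VM group to the expected host group.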
func failIfVMHostAffinityRulesAreNotEnforced(ctx context.Context,
nodes *corev1.NodeList,
platform *configv1.VSpherePlatformSpec,
vsphereCreds *corev1.Secret) {

By("validating VM-Host affinity rules are correctly configured and enforced")

// vm-host zonal deployments only ever have a single vCenter
Expect(platform.VCenters).To(HaveLen(1), "Expected only one vCenter to be configured, but found %d", len(platform.VCenters))

vim25Client, _, logout, err := getVSphereClientsFromClusterCreds(ctx, platform, vsphereCreds)
Expect(err).NotTo(HaveOccurred(), "expected to get vSphere clients from cluster credentials")
// Defer logout only after the error check; logout may be nil if client creation failed.
defer logout()

for _, fd := range platform.FailureDomains {
By(fmt.Sprintf("checking VM-Host affinity rules for failure domain %s", fd.Name))

// Get cluster configuration to check VM-Host rules
finder := find.NewFinder(vim25Client, true)
ccr, err := finder.ClusterComputeResource(ctx, fd.Topology.ComputeCluster)
Expect(err).NotTo(HaveOccurred(), "expected to find cluster compute resource")

clusterConfig, err := ccr.Configuration(ctx)
Expect(err).NotTo(HaveOccurred(), "expected to get cluster configuration")

// Verify VM-Host affinity rule exists and is properly configured
var vmHostRule *types.ClusterVmHostRuleInfo
for _, rule := range clusterConfig.Rule {
	// Assign to the outer vmHostRule instead of shadowing it in the type
	// assertion, so the nil check after the loop can tell whether the rule
	// was actually found.
	ruleInfo, ok := rule.(*types.ClusterVmHostRuleInfo)
	if !ok || ruleInfo.Name != fd.ZoneAffinity.HostGroup.VMHostRule {
		continue
	}
	vmHostRule = ruleInfo
	By(fmt.Sprintf("found VM-Host rule %s for failure domain %s", vmHostRule.Name, fd.Name))

	// Verify the rule references the correct VM and Host groups
	Expect(vmHostRule.VmGroupName).To(Equal(fd.ZoneAffinity.HostGroup.VMGroup),
		"VM-Host rule should reference the correct VM group")
	Expect(vmHostRule.AffineHostGroupName).To(Equal(fd.ZoneAffinity.HostGroup.HostGroup),
		"VM-Host rule should reference the correct Host group")
	// Enabled is a *bool in govmomi; HaveValue dereferences it before matching.
	Expect(vmHostRule.Enabled).To(HaveValue(BeTrue()),
		"VM-Host affinity rule should be enabled")

	By(fmt.Sprintf("verified VM-Host affinity rule %s is correctly configured", vmHostRule.Name))
	break
}

Expect(vmHostRule).NotTo(BeNil(), "VM-Host affinity rule %s should exist for failure domain %s",
fd.ZoneAffinity.HostGroup.VMHostRule, fd.Name)
}
}

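// failIfMachineAPIViolatesZonalConstraints lists the Machine API machines in each
// failure domain and confirms that every machine's backing VM is a member of the
// DRS VM group named by that failure domain's zone affinity.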
func failIfMachineAPIViolatesZonalConstraints(ctx context.Context,
platform *configv1.VSpherePlatformSpec,
vsphereCreds *corev1.Secret) {

By("testing Machine API zonal constraint enforcement during provisioning")

// This test verifies that the Machine API respects zonal constraints.
// As a minimal implementation, we verify that existing machines comply with those constraints.

vim25Client, _, logout, err := getVSphereClientsFromClusterCreds(ctx, platform, vsphereCreds)
Expect(err).NotTo(HaveOccurred(), "expected to get vSphere clients from cluster credentials")
// Defer logout only after the error check; logout may be nil if client creation failed.
defer logout()

// Get all machines to verify they comply with zonal constraints
cfg, err := e2e.LoadConfig()
Expect(err).NotTo(HaveOccurred(), "expected LoadConfig() to succeed")

// Create machine client to get machine list
machineClient, err := machinesetclient.NewForConfig(cfg)
Expect(err).NotTo(HaveOccurred(), "expected to create machine client")

machineList, err := machineClient.Machines("openshift-machine-api").List(ctx, v1.ListOptions{})
Expect(err).NotTo(HaveOccurred(), "expected to get machine list")

for _, fd := range platform.FailureDomains {
By(fmt.Sprintf("verifying machines in failure domain %s comply with zonal constraints", fd.Name))

machinesInFd, err := getMachinesInFailureDomain(platform, fd, machineList)
Expect(err).NotTo(HaveOccurred(), "expected to get machines in failure domain")

if len(machinesInFd) == 0 {
By(fmt.Sprintf("no machines found in failure domain %s, skipping", fd.Name))
continue
}

clusterVmGroups, err := getClusterVmGroups(ctx, vim25Client, fd.Topology.ComputeCluster)
Expect(err).NotTo(HaveOccurred(), "expected cluster vm groups to be available")

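// Locate the DRS VM group that this failure domain's zone affinity names.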
var clusterVmGroup *types.ClusterVmGroup
for _, group := range clusterVmGroups {
if fd.ZoneAffinity.HostGroup.VMGroup == group.Name {
clusterVmGroup = group
break
}
}

Expect(clusterVmGroup).NotTo(BeNil(), "VM group %s should exist for failure domain %s",
fd.ZoneAffinity.HostGroup.VMGroup, fd.Name)

// Verify each machine in the failure domain has its VM in the correct VM group
searchIndex := object.NewSearchIndex(vim25Client)
for _, machine := range machinesInFd {
By(fmt.Sprintf("verifying machine %s is in correct VM group", machine.Name))

if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" {
By(fmt.Sprintf("machine %s has no provider ID, skipping", machine.Name))
continue
}

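// Provider IDs have the form "vsphere://<uuid>"; splitting on the scheme
// prefix leaves the bare UUID in parts[1].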
parts := strings.Split(*machine.Spec.ProviderID, "vsphere://")
Expect(parts).To(HaveLen(2), "expected valid vSphere provider ID")

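// instanceUuid=false makes govmomi search by the VM's BIOS UUID, which is
// the UUID the vSphere provider ID carries.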
ref, err := searchIndex.FindAllByUuid(ctx, nil, parts[1], true, ptr.To(false))
Expect(err).NotTo(HaveOccurred(), "expected FindAllByUuid to succeed")
Expect(ref).To(HaveLen(1), "expected exactly one VM reference")

vmRef := ref[0].Reference()
vmInGroup := false
for _, groupVmRef := range clusterVmGroup.Vm {
if groupVmRef.Value == vmRef.Value {
vmInGroup = true
break
}
}

Expect(vmInGroup).To(BeTrue(), "machine %s VM should be in VM group %s",
machine.Name, fd.ZoneAffinity.HostGroup.VMGroup)
}

By(fmt.Sprintf("verified all machines in failure domain %s comply with zonal constraints", fd.Name))
}
}

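// failIfZoneFailureRecoveryIsNotGraceful is a non-destructive resilience check: it
// confirms that nodes are spread across at least two zones and that every populated
// zone has the VM-Host affinity configuration needed to isolate a zone failure.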
func failIfZoneFailureRecoveryIsNotGraceful(ctx context.Context,
nodes *corev1.NodeList,
platform *configv1.VSpherePlatformSpec,
vsphereCreds *corev1.Secret) {

By("testing zone failure simulation and recovery capabilities")

// As a minimal implementation, we validate the cluster's current resilience configuration
// without actually inducing failures (which could be destructive).

vim25Client, _, logout, err := getVSphereClientsFromClusterCreds(ctx, platform, vsphereCreds)
Expect(err).NotTo(HaveOccurred(), "expected to get vSphere clients from cluster credentials")
// Defer logout only after the error check; logout may be nil if client creation failed.
defer logout()

// Verify we have multiple failure domains for resilience
Expect(len(platform.FailureDomains)).To(BeNumerically(">=", 2),
"cluster should have at least 2 failure domains for zone failure resilience")

// Check node distribution across zones
nodeDistribution := make(map[string][]corev1.Node)
for _, node := range nodes.Items {
if node.Labels == nil {
continue
}

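// topology.kubernetes.io/zone is the well-known Kubernetes zone label set on nodes.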
zone, exists := node.Labels["topology.kubernetes.io/zone"]
if !exists {
continue
}

nodeDistribution[zone] = append(nodeDistribution[zone], node)
}

By(fmt.Sprintf("found nodes distributed across %d zones", len(nodeDistribution)))
Expect(len(nodeDistribution)).To(BeNumerically(">=", 2),
"nodes should be distributed across multiple zones for resilience")

// Verify each zone has VM-Host affinity rules configured for proper isolation
for _, fd := range platform.FailureDomains {
By(fmt.Sprintf("verifying zone failure resilience configuration for %s", fd.Name))

nodesInZone, exists := nodeDistribution[fd.Zone]
if !exists || len(nodesInZone) == 0 {
By(fmt.Sprintf("no nodes found in zone %s, skipping resilience check", fd.Zone))
continue
}

// Verify VM-Host affinity configuration exists for this zone
Expect(fd.ZoneAffinity).NotTo(BeNil(), "zone affinity should be configured for resilience")
Expect(fd.ZoneAffinity.HostGroup).NotTo(BeNil(), "host group should be configured for zone isolation")
Expect(fd.ZoneAffinity.HostGroup.VMHostRule).NotTo(BeEmpty(),
"VM-Host rule should be configured for zone %s", fd.Zone)

// Check that cluster has VM groups configured for this zone
clusterVmGroups, err := getClusterVmGroups(ctx, vim25Client, fd.Topology.ComputeCluster)
Expect(err).NotTo(HaveOccurred(), "expected cluster vm groups to be available")

vmGroupExists := false
for _, group := range clusterVmGroups {
if group.Name == fd.ZoneAffinity.HostGroup.VMGroup {
vmGroupExists = true
By(fmt.Sprintf("verified VM group %s exists for zone %s with %d VMs",
group.Name, fd.Zone, len(group.Vm)))
break
}
}

Expect(vmGroupExists).To(BeTrue(), "VM group %s should exist for zone resilience in %s",
fd.ZoneAffinity.HostGroup.VMGroup, fd.Zone)
}

By("verified cluster has proper zone failure resilience configuration")
}

func isVmHostZonal(platform *configv1.VSpherePlatformSpec) bool {
By("check to make sure installed cluster is vm-host zonal")
for _, fd := range platform.FailureDomains {