Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wait for updated cache when patching status #245

Merged
merged 2 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions api/v1alpha1/nodehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,14 @@ type NodeHealthCheckStatus struct {
//+optional
//+operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:io.kubernetes.phase:reason"
Reason string `json:"reason,omitempty"`

// LastUpdateTime is the last time the status was updated.
//
//+optional
//+kubebuilder:validation:Type=string
//+kubebuilder:validation:Format=date-time
//+operator-sdk:csv:customresourcedefinitions:type=status
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
}

// UnhealthyNode defines an unhealthy node and its remediations
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ spec:
per node. Deprecated in favour of UnhealthyNodes.
displayName: In Flight Remediations
path: inFlightRemediations
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
- description: ObservedNodes specified the number of nodes observed by using
the NHC spec.selector
displayName: Observed Nodes
Expand Down
4 changes: 4 additions & 0 deletions bundle/manifests/remediation.medik8s.io_nodehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,10 @@ spec:
description: InFlightRemediations records the timestamp when remediation
triggered per node. Deprecated in favour of UnhealthyNodes.
type: object
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
observedNodes:
description: ObservedNodes specified the number of nodes observed
by using the NHC spec.selector
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,10 @@ spec:
description: InFlightRemediations records the timestamp when remediation
triggered per node. Deprecated in favour of UnhealthyNodes.
type: object
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
observedNodes:
description: ObservedNodes specified the number of nodes observed
by using the NHC spec.selector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ spec:
per node. Deprecated in favour of UnhealthyNodes.
displayName: In Flight Remediations
path: inFlightRemediations
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
- description: ObservedNodes specified the number of nodes observed by using
the NHC spec.selector
displayName: Observed Nodes
Expand Down
33 changes: 30 additions & 3 deletions controllers/nodehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/record"
"k8s.io/utils/pointer"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -219,7 +220,7 @@ func (r *NodeHealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if finalRequeueAfter != nil {
result.RequeueAfter = *finalRequeueAfter
}
patchErr := r.patchStatus(nhc, nhcOrig)
patchErr := r.patchStatus(nhc, nhcOrig, ctx)
if patchErr != nil {
log.Error(err, "failed to update status")
// check if we have an error from the rest of the code already
Expand Down Expand Up @@ -733,7 +734,7 @@ func (r *NodeHealthCheckReconciler) isControlPlaneRemediationAllowed(node *v1.No
return len(controlPlaneRemediationCRs) == 0, nil
}

func (r *NodeHealthCheckReconciler) patchStatus(nhc, nhcOrig *remediationv1alpha1.NodeHealthCheck) error {
func (r *NodeHealthCheckReconciler) patchStatus(nhc, nhcOrig *remediationv1alpha1.NodeHealthCheck, ctx context.Context) error {

log := utils.GetLogWithNHC(r.Log, nhc)

Expand Down Expand Up @@ -767,7 +768,33 @@ func (r *NodeHealthCheckReconciler) patchStatus(nhc, nhcOrig *remediationv1alpha
log.Info("Patching NHC status", "new status", nhc.Status, "patch", string(patchBytes))
}

return r.Client.Status().Patch(context.Background(), nhc, mergeFrom)
// only update lastUpdate when there were other changes
now := metav1.Now()
nhc.Status.LastUpdateTime = &now

if err := r.Client.Status().Patch(ctx, nhc, mergeFrom); err != nil {
return err
}

// Wait until the cache is updated in order to prevent reading a stale status in the next reconcile
// and making wrong decisions based on it. The chance to run into this is very low, because we use RequeueAfter
// with a minimum delay of 1 second everywhere instead of Requeue: true, but this needs to be fixed because
// it bypasses the controller's rate limiter!
err := wait.PollWithContext(ctx, 200*time.Millisecond, 5*time.Second, func(ctx context.Context) (bool, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: I think I have noticed that wait.PollWithContext is deprecated and wait.PollUntilContextTimeout is advised to be used instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see that in the apimachinery version I use, but I will keep an eye on it when updating dependencies

tmpNhc := &remediationv1alpha1.NodeHealthCheck{}
if err := r.Client.Get(ctx, client.ObjectKeyFromObject(nhc), tmpNhc); err != nil {
if apierrors.IsNotFound(err) {
// nothing we can do anymore
return true, nil
}
return false, nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Return an error?
We never return an error in these lines..

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why should I return an error here? 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

func(ctx context.Context) (bool, error) is expecting an error as the last argument that the function returns, and we haven't returned any error here. I think it would be clearer to not return an error at all and return only a bool or return at least once an error. WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the function signature is fixed, it's an argument to wait.Poll()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oho, I didn't notice that it's an argument to wait.Poll()

}
return tmpNhc.Status.LastUpdateTime != nil && (tmpNhc.Status.LastUpdateTime.Equal(nhc.Status.LastUpdateTime) || tmpNhc.Status.LastUpdateTime.After(nhc.Status.LastUpdateTime.Time)), nil
})
if err != nil {
return errors.Wrapf(err, "failed to wait for updated cache after status patch")
}
return nil
}

func (r *NodeHealthCheckReconciler) alertOldRemediationCR(remediationCR *unstructured.Unstructured) (bool, *time.Duration) {
Expand Down
17 changes: 9 additions & 8 deletions controllers/nodehealthcheck_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -612,16 +612,17 @@ var _ = Describe("Node Health Check CR", func() {
Expect(err).ToNot(HaveOccurred())

//Remediation should be removed
Eventually(
func() bool {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(cr), cr)
return errors.IsNotFound(err)
},
time.Second, time.Millisecond*100).Should(BeTrue())
Eventually(func() bool {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(cr), cr)
return errors.IsNotFound(err)
}, "2s", "100ms").Should(BeTrue(), "remediation CR wasn't removed")

//Verify NHC removed the lease
err = k8sClient.Get(context.Background(), client.ObjectKey{Name: leaseName, Namespace: leaseNs}, lease)
Expect(errors.IsNotFound(err)).To(BeTrue())
Eventually(func() bool {
err = k8sClient.Get(context.Background(), client.ObjectKey{Name: leaseName, Namespace: leaseNs}, lease)
return errors.IsNotFound(err)
}, "2s", "100ms").Should(BeTrue(), "lease wasn't removed")

})
})

Expand Down
Loading