Skip to content

Commit

Permalink
Add prometheus wpa controller reconcile and wpa valid metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
khewonc committed Mar 9, 2022
1 parent cc42350 commit 3f999fc
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 3 deletions.
53 changes: 50 additions & 3 deletions controllers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,29 @@ const (
metricNamePromLabel = "metric_name"
reasonPromLabel = "reason"
transitionPromLabel = "transition"
reconcileErrPromLabel = "reconcile_err"
// Label values
downscaleCappingPromLabelVal = "downscale_capping"
upscaleCappingPromLabelVal = "upscale_capping"
withinBoundsPromLabelVal = "within_bounds"
downscaleCappingPromLabelVal = "downscale_capping"
upscaleCappingPromLabelVal = "upscale_capping"
withinBoundsPromLabelVal = "within_bounds"
nullPromLabelVal = "null"
scaleNotFoundPromLabelVal = "scale_not_found"
invalidAPIVersionPromLabelVal = "invalid_api_version"
unknownResourcePromLabelVal = "unknown_resource"
failedUpdateReplicasPromLabelVal = "failed_update_replicas"
failedComputeReplicasPromLabelVal = "failed_compute_replicas"
failedScalePromLabelVal = "failed_scale"

promSuccessValue = 1.0
promFailureValue = 0.0
)

// reasonValues enumerates every value the 'reason' label can take:
// downscale capping, upscale capping, or within bounds.
var reasonValues = []string{
	downscaleCappingPromLabelVal,
	upscaleCappingPromLabelVal,
	withinBoundsPromLabelVal,
}

// reconcileReasonValues enumerates the possible `reconcile_err` label values:
// the "null" value plus one entry per reconcile failure reason.
// It is iterated when deleting reconcile_success series, so every value that
// is ever set on that label must be listed here.
var reconcileReasonValues = []string{
	nullPromLabelVal,
	scaleNotFoundPromLabelVal,
	invalidAPIVersionPromLabelVal,
	unknownResourcePromLabelVal,
	failedUpdateReplicasPromLabelVal,
	failedComputeReplicasPromLabelVal,
	failedScalePromLabelVal,
}

// extraPromLabels holds the labels to add to an info metric and join on
// (with wpaNamePromLabel) in the Datadog prometheus check. The list is read
// once, at package initialization, from the whitespace-separated
// DD_LABELS_AS_TAGS environment variable.
var extraPromLabels = func() []string {
	rawLabels := os.Getenv("DD_LABELS_AS_TAGS")
	return strings.Fields(rawLabels)
}()

Expand Down Expand Up @@ -199,6 +213,31 @@ var (
},
append(extraPromLabels, wpaNamePromLabel, resourceNamespacePromLabel),
)
// reconcileSuccess is a gauge vector reporting the outcome of the last
// recorded reconcile for a WPA. It is keyed by the WPA name, the target
// resource namespace/name/kind, and a reconcile_err label that carries
// either the failure reason or the "null" value used on success.
reconcileSuccess = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: subsystem,
Name: "reconcile_success",
Help: "Gauge indicating whether the last recorded reconcile is successful",
},
[]string{
wpaNamePromLabel,
resourceNamespacePromLabel,
resourceNamePromLabel,
resourceKindPromLabel,
reconcileErrPromLabel,
})
// wpaValid is a gauge vector reporting whether a WPA spec passed validation.
// It is keyed by the WPA name and the target resource namespace/name/kind;
// the reconciler sets it to promFailureValue when the spec check fails and
// to promSuccessValue otherwise.
wpaValid = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: subsystem,
Name: "wpa_valid",
Help: "Gauge indicating whether the wpa spec is valid",
},
[]string{
wpaNamePromLabel,
resourceNamespacePromLabel,
resourceNamePromLabel,
resourceKindPromLabel,
})
)

func init() {
Expand All @@ -215,6 +254,8 @@ func init() {
sigmetrics.Registry.MustRegister(replicaMax)
sigmetrics.Registry.MustRegister(dryRun)
sigmetrics.Registry.MustRegister(labelsInfo)
sigmetrics.Registry.MustRegister(reconcileSuccess)
sigmetrics.Registry.MustRegister(wpaValid)
}

func cleanupAssociatedMetrics(wpa *datadoghqv1alpha1.WatermarkPodAutoscaler, onlyMetricsSpecific bool) {
Expand Down Expand Up @@ -249,6 +290,12 @@ func cleanupAssociatedMetrics(wpa *datadoghqv1alpha1.WatermarkPodAutoscaler, onl
}
labelsInfo.Delete(promLabelsInfo)
dryRun.Delete(promLabelsForWpa)
for _, reason := range reconcileReasonValues {
promLabelsForWpa[reconcileErrPromLabel] = reason
reconcileSuccess.Delete(promLabelsForWpa)
}
delete(promLabelsForWpa, reconcileErrPromLabel)
wpaValid.Delete(promLabelsForWpa)
}

for _, metricSpec := range wpa.Spec.Metrics {
Expand Down
22 changes: 22 additions & 0 deletions controllers/watermarkpodautoscaler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,18 @@ func (r *WatermarkPodAutoscalerReconciler) Reconcile(ctx context.Context, reques
// default values of the WatermarkPodAutoscaler are set. Return and requeue to show them in the spec.
return reconcile.Result{Requeue: true}, nil
}
promLabels := prometheus.Labels{
wpaNamePromLabel: instance.Name,
resourceNamespacePromLabel: instance.Namespace,
resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name,
resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind,
}
wpaValid.Delete(promLabels)
if err = datadoghqv1alpha1.CheckWPAValidity(instance); err != nil {
log.Info("Got an invalid WPA spec", "Instance", request.NamespacedName.String(), "error", err)
// If the WPA spec is incorrect (most likely, in "metrics" section) stop processing it
// When the spec is updated, the wpa will be re-added to the reconcile queue
wpaValid.With(promLabels).Set(promFailureValue)
r.eventRecorder.Event(instance, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedSpecCheck, err.Error())
setCondition(instance, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ReasonFailedSpecCheck, "Invalid WPA specification: %s", err)
if err = r.updateStatusIfNeeded(ctx, wpaStatusOriginal, instance); err != nil {
Expand All @@ -151,9 +159,15 @@ func (r *WatermarkPodAutoscalerReconciler) Reconcile(ctx context.Context, reques
// and if the user updates the WPA.Spec the update event will requeue the resource.
return reconcile.Result{}, nil
}
wpaValid.With(promLabels).Set(promSuccessValue)

fillMissingWatermark(log, instance)

for _, reason := range reconcileReasonValues {
promLabels[reconcileErrPromLabel] = reason
reconcileSuccess.Delete(promLabels)
}

if err := r.reconcileWPA(ctx, log, wpaStatusOriginal, instance); err != nil {
log.Info("Error during reconcileWPA", "error", err)
r.eventRecorder.Event(instance, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedProcessWPA, err.Error())
Expand Down Expand Up @@ -181,6 +195,7 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
// the following lines retrieve the GVK of the target ref
targetGV, err := schema.ParseGroupVersion(wpa.Spec.ScaleTargetRef.APIVersion)
if err != nil {
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: invalidAPIVersionPromLabelVal}).Set(promFailureValue)
return fmt.Errorf("invalid API version in scale target reference: %v", err)
}
targetGK := schema.GroupKind{
Expand All @@ -189,12 +204,14 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
}
mappings, err := r.restMapper.RESTMappings(targetGK)
if err != nil {
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: unknownResourcePromLabelVal}).Set(promFailureValue)
return fmt.Errorf("unable to determine resource for scale target reference: %v", err)
}

currentScale, targetGR, err := r.getScaleForResourceMappings(ctx, wpa.Namespace, wpa.Spec.ScaleTargetRef.Name, mappings)
if currentScale == nil && strings.Contains(err.Error(), scaleNotFoundErr) {
// it is possible that one of the GK in the mappings was not found, but if we have at least one that works, we can continue reconciling.
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: scaleNotFoundPromLabelVal}).Set(promFailureValue)
return err
}
currentReplicas := currentScale.Status.Replicas
Expand Down Expand Up @@ -245,10 +262,12 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
r.eventRecorder.Event(wpa, corev1.EventTypeWarning, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, err2.Error())
setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, "the WPA controller was unable to update the number of replicas: %v", err)
logger.Info("The WPA controller was unable to update the number of replicas", "error", err2)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: failedUpdateReplicasPromLabelVal}).Set(promFailureValue)
return nil
}
r.eventRecorder.Event(wpa, corev1.EventTypeWarning, "FailedComputeMetricsReplicas", err.Error())
logger.Info("Failed to compute desired number of replicas based on listed metrics.", "reference", reference, "error", err)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: failedComputeReplicasPromLabelVal}).Set(promFailureValue)
return nil
}
logger.Info("Proposing replicas", "proposedReplicas", proposedReplicas, "metricName", metricName, "reference", reference, "metric timestamp", metricTimestamp.Format(time.RFC1123))
Expand Down Expand Up @@ -287,8 +306,10 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
if err := r.updateStatusIfNeeded(ctx, wpaStatusOriginal, wpa); err != nil {
r.eventRecorder.Event(wpa, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedUpdateReplicasStatus, err.Error())
setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, "the WPA controller was unable to update the number of replicas: %v", err)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: failedUpdateReplicasPromLabelVal}).Set(promFailureValue)
return nil
}
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: failedScalePromLabelVal}).Set(promFailureValue)
return nil
}
setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionTrue, datadoghqv1alpha1.ConditionReasonSuccessfulScale, "the WPA controller was able to update the target scale to %d", desiredReplicas)
Expand All @@ -300,6 +321,7 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
desiredReplicas = currentReplicas
}

reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reconcileErrPromLabel: nullPromLabelVal}).Set(promSuccessValue)
replicaEffective.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(float64(desiredReplicas))

// add additional labels to info metric
Expand Down

0 comments on commit 3f999fc

Please sign in to comment.