Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prometheus wpa controller reconcile and wpa valid metrics #142

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions controllers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,24 @@ const (
reasonPromLabel = "reason"
transitionPromLabel = "transition"
// Label values
downscaleCappingPromLabelVal = "downscale_capping"
upscaleCappingPromLabelVal = "upscale_capping"
withinBoundsPromLabelVal = "within_bounds"
downscaleCappingPromLabelVal = "downscale_capping"
upscaleCappingPromLabelVal = "upscale_capping"
withinBoundsPromLabelVal = "within_bounds"
invalidWPAPromLabelVal = "invalid_wpa_spec"
scaleNotFoundPromLabelVal = "scale_not_found"
invalidAPIVersionPromLabelVal = "invalid_api_version"
unknownResourcePromLabelVal = "unknown_resource"
failedUpdateReplicasPromLabelVal = "failed_update_replicas"
failedComputeReplicasPromLabelVal = "failed_compute_replicas"
failedScalePromLabelVal = "failed_scale"
)

// reasonValues enumerates the three 'reason' label values: downscale capping,
// upscale capping, and within bounds.
var reasonValues = []string{
	downscaleCappingPromLabelVal,
	upscaleCappingPromLabelVal,
	withinBoundsPromLabelVal,
}

// reconcileErrorReasonValues enumerates every `reason` label value that can be
// attached to a reconcile error. Code that needs to visit all reconcile-error
// series for a WPA (for example to delete them) ranges over this slice, so any
// new reason value must be added here as well.
var reconcileErrorReasonValues = []string{
	invalidWPAPromLabelVal,
	scaleNotFoundPromLabelVal,
	invalidAPIVersionPromLabelVal,
	unknownResourcePromLabelVal,
	failedUpdateReplicasPromLabelVal,
	failedComputeReplicasPromLabelVal,
	failedScalePromLabelVal,
}

// extraPromLabels holds the labels to add to an info metric and join on (with
// wpaNamePromLabel) in the Datadog prometheus check. The names are read once,
// at package initialization, from the DD_LABELS_AS_TAGS environment variable,
// which is split on whitespace.
var extraPromLabels = func() []string {
	rawLabels := os.Getenv("DD_LABELS_AS_TAGS")
	return strings.Fields(rawLabels)
}()

Expand Down Expand Up @@ -212,6 +222,33 @@ var (
},
append(extraPromLabels, wpaNamePromLabel, wpaNamespacePromLabel, resourceNamespacePromLabel),
)
reconcileError = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: subsystem,
Name: "reconcile_error",
Help: "Gauge indicating whether the last recorded reconcile gave an error",
},
[]string{
wpaNamePromLabel,
wpaNamespacePromLabel,
resourceNamespacePromLabel,
resourceNamePromLabel,
resourceKindPromLabel,
reasonPromLabel,
})
reconcileSuccess = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: subsystem,
Name: "reconcile_success",
Help: "Gauge indicating whether the last recorded reconcile is successful",
},
[]string{
wpaNamePromLabel,
wpaNamespacePromLabel,
resourceNamespacePromLabel,
resourceNamePromLabel,
resourceKindPromLabel,
})
)

func init() {
Expand All @@ -228,6 +265,8 @@ func init() {
sigmetrics.Registry.MustRegister(replicaMax)
sigmetrics.Registry.MustRegister(dryRun)
sigmetrics.Registry.MustRegister(labelsInfo)
sigmetrics.Registry.MustRegister(reconcileError)
sigmetrics.Registry.MustRegister(reconcileSuccess)
}

func cleanupAssociatedMetrics(wpa *datadoghqv1alpha1.WatermarkPodAutoscaler, onlyMetricsSpecific bool) {
Expand Down Expand Up @@ -263,6 +302,12 @@ func cleanupAssociatedMetrics(wpa *datadoghqv1alpha1.WatermarkPodAutoscaler, onl
}
labelsInfo.Delete(promLabelsInfo)
dryRun.Delete(promLabelsForWpa)
for _, reason := range reconcileErrorReasonValues {
promLabelsForWpa[reasonPromLabel] = reason
reconcileError.Delete(promLabelsForWpa)
}
delete(promLabelsForWpa, reasonPromLabel)
reconcileSuccess.Delete(promLabelsForWpa)
}

for _, metricSpec := range wpa.Spec.Metrics {
Expand Down
28 changes: 28 additions & 0 deletions controllers/watermarkpodautoscaler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,23 @@ func (r *WatermarkPodAutoscalerReconciler) Reconcile(ctx context.Context, reques
// default values of the WatermarkPodAutoscaler are set. Return and requeue to show them in the spec.
return reconcile.Result{Requeue: true}, nil
}
promLabels := prometheus.Labels{
wpaNamePromLabel: instance.Name,
wpaNamespacePromLabel: instance.Namespace,
resourceNamespacePromLabel: instance.Namespace,
resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name,
resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind,
}
for _, reason := range reconcileErrorReasonValues {
promLabels[reasonPromLabel] = reason
reconcileError.Delete(promLabels)
}
if err = datadoghqv1alpha1.CheckWPAValidity(instance); err != nil {
log.Info("Got an invalid WPA spec", "Instance", request.NamespacedName.String(), "error", err)
// If the WPA spec is incorrect (most likely, in "metrics" section) stop processing it
// When the spec is updated, the wpa will be re-added to the reconcile queue
reconcileError.With(prometheus.Labels{wpaNamePromLabel: instance.Name, wpaNamespacePromLabel: instance.Namespace, resourceNamespacePromLabel: instance.Namespace, resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name, resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind, reasonPromLabel: invalidWPAPromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: instance.Name, wpaNamespacePromLabel: instance.Namespace, resourceNamespacePromLabel: instance.Namespace, resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name, resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind}).Set(0)
r.eventRecorder.Event(instance, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedSpecCheck, err.Error())
setCondition(instance, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ReasonFailedSpecCheck, "Invalid WPA specification: %s", err)
if err = r.updateStatusIfNeeded(ctx, wpaStatusOriginal, instance); err != nil {
Expand Down Expand Up @@ -184,6 +197,8 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
// the following lines are here to retrieve the GVK of the target ref
targetGV, err := schema.ParseGroupVersion(wpa.Spec.ScaleTargetRef.APIVersion)
if err != nil {
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: invalidAPIVersionPromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return fmt.Errorf("invalid API version in scale target reference: %v", err)
}
targetGK := schema.GroupKind{
Expand All @@ -192,12 +207,16 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
}
mappings, err := r.restMapper.RESTMappings(targetGK)
if err != nil {
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: unknownResourcePromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return fmt.Errorf("unable to determine resource for scale target reference: %v", err)
}

currentScale, targetGR, err := r.getScaleForResourceMappings(ctx, wpa.Namespace, wpa.Spec.ScaleTargetRef.Name, mappings)
if currentScale == nil && strings.Contains(err.Error(), scaleNotFoundErr) {
// it is possible that one of the GK in the mappings was not found, but if we have at least one that works, we can continue reconciling.
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: scaleNotFoundPromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return err
}
currentReplicas := currentScale.Status.Replicas
Expand Down Expand Up @@ -248,10 +267,14 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
r.eventRecorder.Event(wpa, corev1.EventTypeWarning, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, err2.Error())
setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, "the WPA controller was unable to update the number of replicas: %v", err)
logger.Info("The WPA controller was unable to update the number of replicas", "error", err2)
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedUpdateReplicasPromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return nil
}
r.eventRecorder.Event(wpa, corev1.EventTypeWarning, "FailedComputeMetricsReplicas", err.Error())
logger.Info("Failed to compute desired number of replicas based on listed metrics.", "reference", reference, "error", err)
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedComputeReplicasPromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return nil
}
logger.Info("Proposing replicas", "proposedReplicas", proposedReplicas, "metricName", metricName, "reference", reference, "metric timestamp", metricTimestamp.Format(time.RFC1123))
Expand Down Expand Up @@ -290,8 +313,12 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
if err := r.updateStatusIfNeeded(ctx, wpaStatusOriginal, wpa); err != nil {
r.eventRecorder.Event(wpa, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedUpdateReplicasStatus, err.Error())
setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, "the WPA controller was unable to update the number of replicas: %v", err)
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedUpdateReplicasPromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return nil
}
reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedScalePromLabelVal}).Set(1)
reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0)
return nil
}
setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionTrue, datadoghqv1alpha1.ConditionReasonSuccessfulScale, "the WPA controller was able to update the target scale to %d", desiredReplicas)
Expand All @@ -303,6 +330,7 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log
desiredReplicas = currentReplicas
}

reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(1)
replicaEffective.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(float64(desiredReplicas))

// add additional labels to info metric
Expand Down