diff --git a/controllers/metrics.go b/controllers/metrics.go index 8c8b33a5..02f5dc49 100644 --- a/controllers/metrics.go +++ b/controllers/metrics.go @@ -28,14 +28,24 @@ const ( reasonPromLabel = "reason" transitionPromLabel = "transition" // Label values - downscaleCappingPromLabelVal = "downscale_capping" - upscaleCappingPromLabelVal = "upscale_capping" - withinBoundsPromLabelVal = "within_bounds" + downscaleCappingPromLabelVal = "downscale_capping" + upscaleCappingPromLabelVal = "upscale_capping" + withinBoundsPromLabelVal = "within_bounds" + invalidWPAPromLabelVal = "invalid_wpa_spec" + scaleNotFoundPromLabelVal = "scale_not_found" + invalidAPIVersionPromLabelVal = "invalid_api_version" + unknownResourcePromLabelVal = "unknown_resource" + failedUpdateReplicasPromLabelVal = "failed_update_replicas" + failedComputeReplicasPromLabelVal = "failed_compute_replicas" + failedScalePromLabelVal = "failed_scale" ) // reasonValues contains the 3 possible values of the 'reason' label var reasonValues = []string{downscaleCappingPromLabelVal, upscaleCappingPromLabelVal, withinBoundsPromLabelVal} +// reconcileErrorReasonValues contains possible `reason` label values for reconcile errors +var reconcileErrorReasonValues = []string{invalidWPAPromLabelVal, scaleNotFoundPromLabelVal, invalidAPIVersionPromLabelVal, unknownResourcePromLabelVal, failedUpdateReplicasPromLabelVal, failedComputeReplicasPromLabelVal, failedScalePromLabelVal} + // Labels to add to an info metric and join on (with wpaNamePromLabel) in the Datadog prometheus check var extraPromLabels = strings.Fields(os.Getenv("DD_LABELS_AS_TAGS")) @@ -212,6 +222,33 @@ var ( }, append(extraPromLabels, wpaNamePromLabel, wpaNamespacePromLabel, resourceNamespacePromLabel), ) + reconcileError = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: subsystem, + Name: "reconcile_error", + Help: "Gauge indicating whether the last recorded reconcile gave an error", + }, + []string{ + wpaNamePromLabel, + wpaNamespacePromLabel, + resourceNamespacePromLabel, + resourceNamePromLabel, + resourceKindPromLabel, + reasonPromLabel, + }) + reconcileSuccess = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: subsystem, + Name: "reconcile_success", + Help: "Gauge indicating whether the last recorded reconcile is successful", + }, + []string{ + wpaNamePromLabel, + wpaNamespacePromLabel, + resourceNamespacePromLabel, + resourceNamePromLabel, + resourceKindPromLabel, + }) ) func init() { @@ -228,6 +265,8 @@ func init() { sigmetrics.Registry.MustRegister(replicaMax) sigmetrics.Registry.MustRegister(dryRun) sigmetrics.Registry.MustRegister(labelsInfo) + sigmetrics.Registry.MustRegister(reconcileError) + sigmetrics.Registry.MustRegister(reconcileSuccess) } func cleanupAssociatedMetrics(wpa *datadoghqv1alpha1.WatermarkPodAutoscaler, onlyMetricsSpecific bool) { @@ -263,6 +302,12 @@ func cleanupAssociatedMetrics(wpa *datadoghqv1alpha1.WatermarkPodAutoscaler, onl } labelsInfo.Delete(promLabelsInfo) dryRun.Delete(promLabelsForWpa) + for _, reason := range reconcileErrorReasonValues { + promLabelsForWpa[reasonPromLabel] = reason + reconcileError.Delete(promLabelsForWpa) + } + delete(promLabelsForWpa, reasonPromLabel) + reconcileSuccess.Delete(promLabelsForWpa) } for _, metricSpec := range wpa.Spec.Metrics { diff --git a/controllers/watermarkpodautoscaler_controller.go b/controllers/watermarkpodautoscaler_controller.go index f141d48d..9a510e55 100644 --- a/controllers/watermarkpodautoscaler_controller.go +++ b/controllers/watermarkpodautoscaler_controller.go @@ -140,10 +140,23 @@ func (r *WatermarkPodAutoscalerReconciler) Reconcile(ctx context.Context, reques // default values of the WatermarkPodAutoscaler are set. Return and requeue to show them in the spec. return reconcile.Result{Requeue: true}, nil } + promLabels := prometheus.Labels{ + wpaNamePromLabel: instance.Name, + wpaNamespacePromLabel: instance.Namespace, + resourceNamespacePromLabel: instance.Namespace, + resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name, + resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind, + } + for _, reason := range reconcileErrorReasonValues { + promLabels[reasonPromLabel] = reason + reconcileError.Delete(promLabels) + } if err = datadoghqv1alpha1.CheckWPAValidity(instance); err != nil { log.Info("Got an invalid WPA spec", "Instance", request.NamespacedName.String(), "error", err) // If the WPA spec is incorrect (most likely, in "metrics" section) stop processing it // When the spec is updated, the wpa will be re-added to the reconcile queue + reconcileError.With(prometheus.Labels{wpaNamePromLabel: instance.Name, wpaNamespacePromLabel: instance.Namespace, resourceNamespacePromLabel: instance.Namespace, resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name, resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind, reasonPromLabel: invalidWPAPromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: instance.Name, wpaNamespacePromLabel: instance.Namespace, resourceNamespacePromLabel: instance.Namespace, resourceNamePromLabel: instance.Spec.ScaleTargetRef.Name, resourceKindPromLabel: instance.Spec.ScaleTargetRef.Kind}).Set(0) r.eventRecorder.Event(instance, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedSpecCheck, err.Error()) setCondition(instance, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ReasonFailedSpecCheck, "Invalid WPA specification: %s", err) if err = r.updateStatusIfNeeded(ctx, wpaStatusOriginal, instance); err != nil { @@ -184,6 +197,8 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log // the following line are here to retrieve the GVK of the target ref targetGV, err := schema.ParseGroupVersion(wpa.Spec.ScaleTargetRef.APIVersion) if err != nil { + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: invalidAPIVersionPromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return fmt.Errorf("invalid API version in scale target reference: %v", err) } targetGK := schema.GroupKind{ @@ -192,12 +207,16 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log } mappings, err := r.restMapper.RESTMappings(targetGK) if err != nil { + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: unknownResourcePromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return fmt.Errorf("unable to determine resource for scale target reference: %v", err) } currentScale, targetGR, err := r.getScaleForResourceMappings(ctx, wpa.Namespace, wpa.Spec.ScaleTargetRef.Name, mappings) if currentScale == nil && strings.Contains(err.Error(), scaleNotFoundErr) { // it is possible that one of the GK in the mappings was not found, but if we have at least one that works, we can continue reconciling. + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: scaleNotFoundPromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return err } currentReplicas := currentScale.Status.Replicas @@ -248,10 +267,14 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log r.eventRecorder.Event(wpa, corev1.EventTypeWarning, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, err2.Error()) setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, "the WPA controller was unable to update the number of replicas: %v", err) logger.Info("The WPA controller was unable to update the number of replicas", "error", err2) + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedUpdateReplicasPromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return nil } r.eventRecorder.Event(wpa, corev1.EventTypeWarning, "FailedComputeMetricsReplicas", err.Error()) logger.Info("Failed to compute desired number of replicas based on listed metrics.", "reference", reference, "error", err) + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedComputeReplicasPromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return nil } logger.Info("Proposing replicas", "proposedReplicas", proposedReplicas, "metricName", metricName, "reference", reference, "metric timestamp", metricTimestamp.Format(time.RFC1123)) @@ -290,8 +313,12 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log if err := r.updateStatusIfNeeded(ctx, wpaStatusOriginal, wpa); err != nil { r.eventRecorder.Event(wpa, corev1.EventTypeWarning, datadoghqv1alpha1.ReasonFailedUpdateReplicasStatus, err.Error()) setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionFalse, datadoghqv1alpha1.ConditionReasonFailedUpdateReplicasStatus, "the WPA controller was unable to update the number of replicas: %v", err) + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedUpdateReplicasPromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return nil } + reconcileError.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind, reasonPromLabel: failedScalePromLabelVal}).Set(1) + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(0) return nil } setCondition(wpa, autoscalingv2.AbleToScale, corev1.ConditionTrue, datadoghqv1alpha1.ConditionReasonSuccessfulScale, "the WPA controller was able to update the target scale to %d", desiredReplicas) @@ -303,6 +330,7 @@ func (r *WatermarkPodAutoscalerReconciler) reconcileWPA(ctx context.Context, log desiredReplicas = currentReplicas } + reconcileSuccess.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(1) replicaEffective.With(prometheus.Labels{wpaNamePromLabel: wpa.Name, wpaNamespacePromLabel: wpa.Namespace, resourceNamespacePromLabel: wpa.Namespace, resourceNamePromLabel: wpa.Spec.ScaleTargetRef.Name, resourceKindPromLabel: wpa.Spec.ScaleTargetRef.Kind}).Set(float64(desiredReplicas)) // add additional labels to info metric