@@ -9,6 +9,95 @@ import (
 	prometheus "github.com/prometheus/client_golang/api"
 )

+// Prometheus Metric Diagnostic IDs
+const (
+	// CAdvisorDiagnosticMetricID is the identifier for the metric used to determine if cAdvisor is being scraped.
+	CAdvisorDiagnosticMetricID = "cadvisorMetric"
+
+	// CAdvisorLabelDiagnosticMetricID is the identifier for the metric used to determine if cAdvisor labels are correct.
+	CAdvisorLabelDiagnosticMetricID = "cadvisorLabel"
+
+	// KSMDiagnosticMetricID is the identifier for the metric used to determine if KSM metrics are being scraped.
+	KSMDiagnosticMetricID = "ksmMetric"
+
+	// KSMVersionDiagnosticMetricID is the identifier for the metric used to determine if the KSM version is correct.
+	KSMVersionDiagnosticMetricID = "ksmVersion"
+
+	// KubecostDiagnosticMetricID is the identifier for the metric used to determine if Kubecost metrics are being scraped.
+	KubecostDiagnosticMetricID = "kubecostMetric"
+
+	// NodeExporterDiagnosticMetricID is the identifier for the metric used to determine if NodeExporter metrics are being scraped.
+	NodeExporterDiagnosticMetricID = "neMetric"
+
+	// ScrapeIntervalDiagnosticMetricID is the identifier for the metric used to determine if Prometheus has its own self-scraped
+	// metrics.
+	ScrapeIntervalDiagnosticMetricID = "scrapeInterval"
+
+	// CPUThrottlingDiagnosticMetricID is the identifier for the metric used to determine if CPU throttling is being applied to the
+	// cost-model container.
+	CPUThrottlingDiagnosticMetricID = "cpuThrottling"
+)
+
+// DocumentationBaseURL is the base URL for the Kubecost diagnostics documentation.
+const DocumentationBaseURL = "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+
+// diagnosticDefinitions maps each diagnostic metric ID to the definition used to run
+// Prometheus metric diagnostics.
+var diagnosticDefinitions = map[string]*diagnosticDefinition{
+	CAdvisorDiagnosticMetricID: {
+		ID:          CAdvisorDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(container_cpu_usage_seconds_total[5m] %s)`,
+		Label:       "cAdvisor metrics available",
+		Description: "Determine if cAdvisor metrics are available during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
+	},
+	KSMDiagnosticMetricID: {
+		ID:          KSMDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`,
+		Label:       "Kube-state-metrics available",
+		Description: "Determine if metrics from kube-state-metrics are available during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", DocumentationBaseURL),
+	},
+	KubecostDiagnosticMetricID: {
+		ID:          KubecostDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(node_cpu_hourly_cost[5m] %s)`,
+		Label:       "Kubecost metrics available",
+		Description: "Determine if metrics from Kubecost are available during the last 5 minutes.",
+	},
+	NodeExporterDiagnosticMetricID: {
+		ID:          NodeExporterDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(node_cpu_seconds_total[5m] %s)`,
+		Label:       "Node-exporter metrics available",
+		Description: "Determine if metrics from node-exporter are available during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", DocumentationBaseURL),
+	},
+	CAdvisorLabelDiagnosticMetricID: {
+		ID:          CAdvisorLabelDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`,
+		Label:       "Expected cAdvisor labels available",
+		Description: "Determine if the expected cAdvisor labels are present during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
+	},
+	KSMVersionDiagnosticMetricID: {
+		ID:          KSMVersionDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`,
+		Label:       "Expected kube-state-metrics version found",
+		Description: "Determine if metrics from the required kube-state-metrics version are present during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", DocumentationBaseURL),
+	},
+	ScrapeIntervalDiagnosticMetricID: {
+		ID:          ScrapeIntervalDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`,
+		Label:       "Expected Prometheus self-scrape metrics available",
+		Description: "Determine if Prometheus has its own self-scraped metrics during the last 5 minutes.",
+	},
+	CPUThrottlingDiagnosticMetricID: {
+		ID: CPUThrottlingDiagnosticMetricID,
+		QueryFmt: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace)
+			/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace) > 0.2`,
+		Label:       "Kubecost is not CPU throttled",
+		Description: "Kubecost loading slowly? A Kubecost component might be CPU throttled.",
+	},
+}
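+
+// For illustration only: assuming a hypothetical offset argument of "offset 10m",
+// the cadvisorMetric QueryFmt above expands (via NewDiagnostic, defined below) to:
+//
+//	absent_over_time(container_cpu_usage_seconds_total[5m] offset 10m)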
+
 // QueuedPromRequest is a representation of a request waiting to be sent by the prometheus
 // client.
 type QueuedPromRequest struct {
@@ -66,75 +155,89 @@ func LogPrometheusClientState(client prometheus.Client) {
 }

 // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
-func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
-	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+func GetPrometheusMetrics(client prometheus.Client, offset string) PrometheusDiagnostics {
 	ctx := NewNamedContext(client, DiagnosticContextName)

-	result := []*PrometheusDiagnostic{
-		{
-			ID:          "cadvisorMetric",
-			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
-			Label:       "cAdvsior metrics available",
-			Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
-		},
-		{
-			ID:          "ksmMetric",
-			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
-			Label:       "Kube-state-metrics available",
-			Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
-		},
-		{
-			ID:          "kubecostMetric",
-			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
-			Label:       "Kubecost metrics available",
-			Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
-		},
-		{
-			ID:          "neMetric",
-			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
-			Label:       "Node-exporter metrics available",
-			Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
-		},
-		{
-			ID:          "cadvisorLabel",
-			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`, offset),
-			Label:       "Expected cAdvsior labels available",
-			Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
-		},
-		{
-			ID:          "ksmVersion",
-			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
-			Label:       "Expected kube-state-metrics version found",
-			Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
-		},
-		{
-			ID:          "scrapeInterval",
-			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
-			Label:       "Expected Prometheus self-scrape metrics available",
-			Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
-		},
-		{
-			ID: "cpuThrottling",
-			Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
-				/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
-			Label:       "Kubecost is not CPU throttled",
-			Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
-		},
-	}
-
-	for _, pd := range result {
+	var result []*PrometheusDiagnostic
+	for _, definition := range diagnosticDefinitions {
+		pd := definition.NewDiagnostic(offset)
 		err := pd.executePrometheusDiagnosticQuery(ctx)
+
+		// log the error, append to the results anyway, and continue
 		if err != nil {
 			log.Errorf(err.Error())
 		}
+		result = append(result, pd)
+	}
+
+	return result
+}
+
+// GetPrometheusMetricsByID returns the state of the specific Prometheus metrics identified by ids.
+func GetPrometheusMetricsByID(ids []string, client prometheus.Client, offset string) PrometheusDiagnostics {
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	var result []*PrometheusDiagnostic
+	for _, id := range ids {
+		if definition, ok := diagnosticDefinitions[id]; ok {
+			pd := definition.NewDiagnostic(offset)
+			err := pd.executePrometheusDiagnosticQuery(ctx)
+
+			// log the error, append to the results anyway, and continue
+			if err != nil {
+				log.Errorf(err.Error())
+			}
+			result = append(result, pd)
+		} else {
+			log.Warnf("Failed to find diagnostic definition for id: %s", id)
+		}
+	}
+
+	return result
+}
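+
+// Hypothetical usage sketch, not part of this change: check only the cAdvisor
+// diagnostics, assuming the offset argument takes the PromQL form "offset 10m".
+//
+//	ids := []string{CAdvisorDiagnosticMetricID, CAdvisorLabelDiagnosticMetricID}
+//	diags := GetPrometheusMetricsByID(ids, client, "offset 10m")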
+
+// PrometheusDiagnostics is a collection of PrometheusDiagnostic instances with helper methods.
+type PrometheusDiagnostics []*PrometheusDiagnostic
+
+// HasFailure returns true if any of the diagnostic tests did not pass.
+func (pd PrometheusDiagnostics) HasFailure() bool {
+	for _, p := range pd {
+		if !p.Passed {
+			return true
+		}
 	}

-	return result, nil
+	return false
+}
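+
+// Hypothetical usage sketch, not part of this change: run every diagnostic and
+// surface a warning when the aggregate result contains a failure.
+//
+//	diags := GetPrometheusMetrics(client, offset)
+//	if diags.HasFailure() {
+//		log.Warnf("at least one Prometheus diagnostic did not pass")
+//	}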
+
+// diagnosticDefinition defines a diagnostic and is used to create new
+// PrometheusDiagnostic instances from its fields.
+type diagnosticDefinition struct {
+	ID          string
+	QueryFmt    string
+	Label       string
+	Description string
+	DocLink     string
+}
+
+// NewDiagnostic creates a new PrometheusDiagnostic from the definition, substituting
+// the provided offset into the query format.
+func (pdd *diagnosticDefinition) NewDiagnostic(offset string) *PrometheusDiagnostic {
+	// FIXME: Is there a reasonable way to get the total number of replacements required in the query?
+	// FIXME: All of the other queries require a single offset replacement, but CPUThrottling requires two.
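+	//
+	// One possible answer to the FIXME above (an untested sketch, not part of this
+	// change): count the %s verbs in QueryFmt and build the argument list
+	// dynamically, which would require importing "strings":
+	//
+	//	n := strings.Count(pdd.QueryFmt, "%s")
+	//	args := make([]interface{}, n)
+	//	for i := range args {
+	//		args[i] = offset
+	//	}
+	//	query := fmt.Sprintf(pdd.QueryFmt, args...)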
+	var query string
+	if pdd.ID == CPUThrottlingDiagnosticMetricID {
+		query = fmt.Sprintf(pdd.QueryFmt, offset, offset)
+	} else {
+		query = fmt.Sprintf(pdd.QueryFmt, offset)
+	}
+
+	return &PrometheusDiagnostic{
+		ID:          pdd.ID,
+		Query:       query,
+		Label:       pdd.Label,
+		Description: pdd.Description,
+		DocLink:     pdd.DocLink,
+	}
 }

 // PrometheusDiagnostic holds information about a metric and the query to ensure it is functional