Просмотр исходного кода

Add more diagnostic functionality, add a defaults package for generic defaults, and update retry to use generic type over any.

Matt Bolt 4 лет назад
Родитель
Commit
272a050276

+ 3 - 11
pkg/costmodel/router.go

@@ -882,23 +882,15 @@ func (a *Accesses) GetPrometheusMetrics(w http.ResponseWriter, _ *http.Request,
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	promMetrics, err := prom.GetPrometheusMetrics(a.PrometheusClient, "")
-	if err != nil {
-		w.Write(WrapData(nil, err))
-		return
-	}
+	promMetrics := prom.GetPrometheusMetrics(a.PrometheusClient, "")
 
 	result := map[string][]*prom.PrometheusDiagnostic{
 		"prometheus": promMetrics,
 	}
 
 	if thanos.IsEnabled() {
-		thanosMetrics, err := prom.GetPrometheusMetrics(a.ThanosClient, thanos.QueryOffset())
-		if err != nil {
-			log.Warnf("Error getting Thanos queue state: %s", err)
-		} else {
-			result["thanos"] = thanosMetrics
-		}
+		thanosMetrics := prom.GetPrometheusMetrics(a.ThanosClient, thanos.QueryOffset())
+		result["thanos"] = thanosMetrics
 	}
 
 	w.Write(WrapData(result, nil))

+ 0 - 7
pkg/kubecost/common.go

@@ -14,10 +14,3 @@ func NewPair[T any, U any](first T, second U) Pair[T, U] {
 		Second: second,
 	}
 }
-
-// DefaultValue[T] returns the default value for any generic type. This is helpful for generic
-// types where a type parameter can be a value type or pointer.
-func DefaultValue[T any]() T {
-	var t T
-	return t
-}

+ 164 - 61
pkg/prom/diagnostics.go

@@ -9,6 +9,95 @@ import (
 	prometheus "github.com/prometheus/client_golang/api"
 )
 
+// Prometheus Metric Diagnostic IDs
+const (
+	// CAdvisorDiagnosticMetricID is the identifier of the metric used to determine if cAdvisor is being scraped.
+	CAdvisorDiagnosticMetricID = "cadvisorMetric"
+
+	// CAdvisorLabelDiagnosticMetricID is the identifier of the metric used to determine if cAdvisor labels are correct.
+	CAdvisorLabelDiagnosticMetricID = "cadvisorLabel"
+
+	// KSMDiagnosticMetricID is the identifier for the metric used to determine if KSM metrics are being scraped.
+	KSMDiagnosticMetricID = "ksmMetric"
+
+	// KSMVersionDiagnosticMetricID is the identifier for the metric used to determine if KSM version is correct.
+	KSMVersionDiagnosticMetricID = "ksmVersion"
+
+	// KubecostDiagnosticMetricID is the identifier for the metric used to determine if Kubecost metrics are being scraped.
+	KubecostDiagnosticMetricID = "kubecostMetric"
+
+	// NodeExporterDiagnosticMetricID is the identifier for the metric used to determine if NodeExporter metrics are being scraped.
+	NodeExporterDiagnosticMetricID = "neMetric"
+
+	// ScrapeIntervalDiagnosticMetricID is the identifier for the metric used to determine if prometheus has its own self-scraped
+	// metrics.
+	ScrapeIntervalDiagnosticMetricID = "scrapeInterval"
+
+	// CPUThrottlingDiagnosticMetricID is the identifier for the metric used to determine if CPU throttling is being applied to the
+	// cost-model container.
+	CPUThrottlingDiagnosticMetricID = "cpuThrottling"
+)
+
+const DocumentationBaseURL = "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+
+// diagnostic definitions mapping holds all of the diagnostic definitions that can be used for prometheus metrics diagnostics
+var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnosticDefinition{
+	CAdvisorDiagnosticMetricID: {
+		ID:          CAdvisorDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(container_cpu_usage_seconds_total[5m] %s)`,
+		Label:       "cAdvisor metrics available",
+		Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
+	},
+	KSMDiagnosticMetricID: {
+		ID:          KSMDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`,
+		Label:       "Kube-state-metrics available",
+		Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", DocumentationBaseURL),
+	},
+	KubecostDiagnosticMetricID: {
+		ID:          KubecostDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(node_cpu_hourly_cost[5m] %s)`,
+		Label:       "Kubecost metrics available",
+		Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
+	},
+	NodeExporterDiagnosticMetricID: {
+		ID:          NodeExporterDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(node_cpu_seconds_total[5m] %s)`,
+		Label:       "Node-exporter metrics available",
+		Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", DocumentationBaseURL),
+	},
+	CAdvisorLabelDiagnosticMetricID: {
+		ID:          CAdvisorLabelDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`,
+		Label:       "Expected cAdvisor labels available",
+		Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
+	},
+	KSMVersionDiagnosticMetricID: {
+		ID:          KSMVersionDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`,
+		Label:       "Expected kube-state-metrics version found",
+		Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", DocumentationBaseURL),
+	},
+	ScrapeIntervalDiagnosticMetricID: {
+		ID:          ScrapeIntervalDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(prometheus_target_interval_length_seconds[5m]  %s)`,
+		Label:       "Expected Prometheus self-scrape metrics available",
+		Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
+	},
+	CPUThrottlingDiagnosticMetricID: {
+		ID: CPUThrottlingDiagnosticMetricID,
+		QueryFmt: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace)
+	/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace) > 0.2`,
+		Label:       "Kubecost is not CPU throttled",
+		Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
+	},
+}
+
 // QueuedPromRequest is a representation of a request waiting to be sent by the prometheus
 // client.
 type QueuedPromRequest struct {
@@ -66,75 +155,89 @@ func LogPrometheusClientState(client prometheus.Client) {
 }
 
 // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
-func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
-	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+func GetPrometheusMetrics(client prometheus.Client, offset string) PrometheusDiagnostics {
 	ctx := NewNamedContext(client, DiagnosticContextName)
 
-	result := []*PrometheusDiagnostic{
-		{
-			ID:          "cadvisorMetric",
-			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
-			Label:       "cAdvsior metrics available",
-			Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
-		},
-		{
-			ID:          "ksmMetric",
-			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m]  %s)`, offset),
-			Label:       "Kube-state-metrics available",
-			Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
-		},
-		{
-			ID:          "kubecostMetric",
-			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m]  %s)`, offset),
-			Label:       "Kubecost metrics available",
-			Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
-		},
-		{
-			ID:          "neMetric",
-			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
-			Label:       "Node-exporter metrics available",
-			Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
-		},
-		{
-			ID:          "cadvisorLabel",
-			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m]  %s)`, offset),
-			Label:       "Expected cAdvsior labels available",
-			Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
-		},
-		{
-			ID:          "ksmVersion",
-			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m]  %s)`, offset),
-			Label:       "Expected kube-state-metrics version found",
-			Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
-		},
-		{
-			ID:          "scrapeInterval",
-			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m]  %s)`, offset),
-			Label:       "Expected Prometheus self-scrape metrics available",
-			Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
-		},
-		{
-			ID: "cpuThrottling",
-			Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
-		/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
-			Label:       "Kubecost is not CPU throttled",
-			Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
-		},
-	}
-
-	for _, pd := range result {
+	var result []*PrometheusDiagnostic
+	for _, definition := range diagnosticDefinitions {
+		pd := definition.NewDiagnostic(offset)
 		err := pd.executePrometheusDiagnosticQuery(ctx)
+
+		// log the error, append to results anyway, and continue
 		if err != nil {
 			log.Errorf(err.Error())
 		}
+		result = append(result, pd)
+	}
+
+	return result
+}
+
+// GetPrometheusMetricsByID returns a list of the state of specific Prometheus metrics by identifier.
+func GetPrometheusMetricsByID(ids []string, client prometheus.Client, offset string) PrometheusDiagnostics {
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	var result []*PrometheusDiagnostic
+	for _, id := range ids {
+		if definition, ok := diagnosticDefinitions[id]; ok {
+			pd := definition.NewDiagnostic(offset)
+			err := pd.executePrometheusDiagnosticQuery(ctx)
+
+			// log the error, append to results anyway, and continue
+			if err != nil {
+				log.Errorf(err.Error())
+			}
+			result = append(result, pd)
+		} else {
+			log.Warnf("Failed to find diagnostic definition for id: %s", id)
+		}
+	}
+
+	return result
+}
+
+// PrometheusDiagnostics is a PrometheusDiagnostic container with helper methods.
+type PrometheusDiagnostics []*PrometheusDiagnostic
+
+// HasFailure returns true if any of the diagnostic tests didn't pass.
+func (pd PrometheusDiagnostics) HasFailure() bool {
+	for _, p := range pd {
+		if !p.Passed {
+			return true
+		}
 	}
 
-	return result, nil
+	return false
+}
+
+// diagnosticDefinition is a definition of a diagnostic that can be used to create new
+// PrometheusDiagnostic instances using the definition's fields.
+type diagnosticDefinition struct {
+	ID          string
+	QueryFmt    string
+	Label       string
+	Description string
+	DocLink     string
+}
+
+// NewDiagnostic creates a new PrometheusDiagnostic instance using the provided definition data.
+func (pdd *diagnosticDefinition) NewDiagnostic(offset string) *PrometheusDiagnostic {
+	// FIXME: Any reasonable way to get the total number of replacements required in the query?
+	// FIXME: All of the other queries require a single offset replace, but CPUThrottle requires two.
+	var query string
+	if pdd.ID == CPUThrottlingDiagnosticMetricID {
+		query = fmt.Sprintf(pdd.QueryFmt, offset, offset)
+	} else {
+		query = fmt.Sprintf(pdd.QueryFmt, offset)
+	}
+
+	return &PrometheusDiagnostic{
+		ID:          pdd.ID,
+		Query:       query,
+		Label:       pdd.Label,
+		Description: pdd.Description,
+		DocLink:     pdd.DocLink,
+	}
 }
 
 // PrometheusDiagnostic holds information about a metric and the query to ensure it is functional

+ 8 - 0
pkg/util/defaults/defaults.go

@@ -0,0 +1,8 @@
+package defaults
+
+// Default[T] returns the default value for any generic type. This is helpful for generic
+// types where a type parameter can be a value type or pointer.
+func Default[T any]() T {
+	var t T
+	return t
+}

+ 5 - 3
pkg/util/retry/retry.go

@@ -5,6 +5,8 @@ import (
 	"fmt"
 	"math/rand"
 	"time"
+
+	"github.com/kubecost/cost-model/pkg/util/defaults"
 )
 
 // RetryCancellationErr is the error type that's returned if the retry is cancelled
@@ -16,15 +18,15 @@ func IsRetryCancelledError(err error) bool {
 }
 
 // Retry will run the f func until we receive a non error result up to the provided attempts or a cancellation.
-func Retry(ctx context.Context, f func() (interface{}, error), attempts uint, delay time.Duration) (interface{}, error) {
-	var result interface{}
+func Retry[T any](ctx context.Context, f func() (T, error), attempts uint, delay time.Duration) (T, error) {
+	var result T
 	var err error
 
 	d := delay
 	for r := attempts; r > 0; r-- {
 		select {
 		case <-ctx.Done():
-			return nil, RetryCancellationErr
+			return defaults.Default[T](), RetryCancellationErr
 		default:
 		}
 

+ 7 - 8
pkg/util/retry/retry_test.go

@@ -18,7 +18,7 @@ func TestPtrSliceRetry(t *testing.T) {
 
 	var count uint64 = 0
 
-	f := func() (interface{}, error) {
+	f := func() ([]*Obj, error) {
 		c := atomic.AddUint64(&count, 1)
 		fmt.Println("Try:", c)
 
@@ -33,9 +33,8 @@ func TestPtrSliceRetry(t *testing.T) {
 		return nil, fmt.Errorf("Failed: %d", c)
 	}
 
-	result, err := Retry(context.Background(), f, 5, time.Second)
-	objs, ok := result.([]*Obj)
-	if err != nil || !ok {
+	objs, err := Retry(context.Background(), f, 5, time.Second)
+	if err != nil {
 		t.Fatalf("Failed to correctly cast back to slice type")
 	}
 
@@ -48,12 +47,12 @@ func TestSuccessRetry(t *testing.T) {
 
 	var count uint64 = 0
 
-	f := func() (interface{}, error) {
+	f := func() (any, error) {
 		c := atomic.AddUint64(&count, 1)
 		fmt.Println("Try:", c)
 
 		if c == Expected {
-			return struct{}{}, nil
+			return nil, nil
 		}
 
 		return nil, fmt.Errorf("Failed: %d", c)
@@ -72,7 +71,7 @@ func TestFailRetry(t *testing.T) {
 	expectedError := fmt.Sprintf("Failed: %d", Expected)
 	var count uint64 = 0
 
-	f := func() (interface{}, error) {
+	f := func() (any, error) {
 		c := atomic.AddUint64(&count, 1)
 		fmt.Println("Try:", c)
 		return nil, fmt.Errorf("Failed: %d", c)
@@ -95,7 +94,7 @@ func TestCancelRetry(t *testing.T) {
 
 	var count uint64 = 0
 
-	f := func() (interface{}, error) {
+	f := func() (any, error) {
 		c := atomic.AddUint64(&count, 1)
 		fmt.Println("Try:", c)
 		return nil, fmt.Errorf("Failed: %d", c)