Просмотр исходного кода

Merge pull request #913 from kubecost/sean/prom-diagnostics

Add prometheus metrics endpoint
Sean Holcomb 4 года назад
Родитель
Коммит
eb1dbcaab5

+ 28 - 0
pkg/costmodel/router.go

@@ -884,6 +884,33 @@ func (a *Accesses) GetPrometheusQueueState(w http.ResponseWriter, _ *http.Reques
 	w.Write(WrapData(result, nil))
 }
 
+// GetPrometheusMetrics is the HTTP handler that reports the availability of
+// the Prometheus metrics checked by prom.GetPrometheusMetrics. The diagnostics
+// for the Prometheus client are stored under the "prometheus" key; when Thanos
+// is enabled, the same diagnostics are run against the Thanos client (using the
+// Thanos query offset) and stored under the "thanos" key. The map is written
+// to the response via WrapData.
+func (a *Accesses) GetPrometheusMetrics(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
+	w.Header().Set("Content-Type", "application/json")
+	w.Header().Set("Access-Control-Allow-Origin", "*")
+
+	promMetrics, err := prom.GetPrometheusMetrics(a.PrometheusClient, "")
+	if err != nil {
+		w.Write(WrapData(nil, err))
+		return
+	}
+
+	result := map[string][]*prom.PrometheusDiagnostic{
+		"prometheus": promMetrics,
+	}
+
+	if thanos.IsEnabled() {
+		// A Thanos failure is only logged so that the Prometheus diagnostics
+		// gathered above are still returned to the caller.
+		thanosMetrics, err := prom.GetPrometheusMetrics(a.ThanosClient, thanos.QueryOffset())
+		if err != nil {
+			log.Warningf("Error getting Thanos metrics: %s", err)
+		} else {
+			result["thanos"] = thanosMetrics
+		}
+	}
+
+	w.Write(WrapData(result, nil))
+}
+
 // Creates a new ClusterManager instance using a boltdb storage. If that fails,
 // then we fall back to a memory-only storage.
 func newClusterManager() *cm.ClusterManager {
@@ -1243,6 +1270,7 @@ func Initialize(additionalConfigWatchers ...ConfigWatchers) *Accesses {
 
 	// diagnostics
 	a.Router.GET("/diagnostics/requestQueue", a.GetPrometheusQueueState)
+	a.Router.GET("/diagnostics/prometheusMetrics", a.GetPrometheusMetrics)
 
 	// cluster manager endpoints
 	a.Router.GET("/clusters", managerEndpoints.GetAllClusters)

+ 3 - 0
pkg/prom/contextnames.go

@@ -21,4 +21,7 @@ const (
 
 	// FrontendContextName is the name we assign queries proxied from the frontend [metadata]
 	FrontendContextName = "frontend"
+
+	// DiagnosticContextName is the name we assign queries that check the state of the prometheus connection
+	DiagnosticContextName = "diagnostic"
 )

+ 105 - 0
pkg/prom/diagnostics.go

@@ -66,3 +66,108 @@ func LogPrometheusClientState(client prometheus.Client) {
 		log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
 	}
 }
+
+// GetPrometheusMetrics runs a fixed set of diagnostic queries against the
+// provided client and returns one PrometheusDiagnostic per query, in a stable
+// order, with its Result and Passed fields populated.
+//
+// offset is spliced verbatim into the range-vector queries (pass "" for no
+// offset, or a PromQL offset clause for delayed stores such as Thanos).
+//
+// A query that fails is logged and its diagnostic is left with Passed == false
+// and a nil Result; the returned error is currently always nil.
+func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
+	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	result := []*PrometheusDiagnostic{
+		{
+			ID:          "cadvisorMetric",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
+			Label:       "cAdvisor metrics available",
+			Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmMetric",
+			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m]  %s)`, offset),
+			Label:       "Kube-state-metrics available",
+			Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
+		},
+		{
+			ID:          "kubecostMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m]  %s)`, offset),
+			Label:       "Kubecost metrics available",
+			Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
+		},
+		{
+			ID:          "neMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
+			Label:       "Node-exporter metrics available",
+			Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
+		},
+		{
+			// NOTE(review): this query is identical to the "neMetric" query
+			// above, so it does not actually exercise a recording rule.
+			// Presumably it should target a metric produced by a kubecost
+			// recording rule -- TODO confirm the intended metric name.
+			ID:          "recordingMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
+			Label:       "Recording rules are available",
+			Description: "Determine if metrics defined by kubecost recording rules are available during last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#recording-rules-are-available", docs),
+		},
+		{
+			ID:          "cadvisorLabel",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container_name!="",pod_name!="",instance!~"10.*|192.*|172.*"}[5m]  %s)`, offset),
+			Label:       "Expected cAdvisor labels available",
+			Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmVersion",
+			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m]  %s)`, offset),
+			Label:       "Expected kube-state-metrics version found",
+			Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
+		},
+		{
+			ID:          "scrapeInterval",
+			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m]  %s)`, offset),
+			Label:       "Expected Prometheus self-scrape metrics available",
+			Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
+		},
+		{
+			// NOTE(review): unlike the queries above, this one does not apply
+			// offset -- confirm whether that is intended for Thanos.
+			ID: "cpuThrottling",
+			Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
+		/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
+			Label:       "Kubecost is not CPU throttled",
+			Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
+		},
+	}
+
+	for _, pd := range result {
+		// A failing diagnostic must not prevent the remaining ones from
+		// running, so the error is logged and the loop continues.
+		if err := pd.executePrometheusDiagnosticQuery(ctx); err != nil {
+			// Use an explicit format so a '%' in the error text is not
+			// interpreted as a formatting verb.
+			log.Errorf("%s", err)
+		}
+	}
+
+	return result, nil
+}
+
+// PrometheusDiagnostic describes a single diagnostic check: the query to run, its display metadata, and the outcome of executing it.
+type PrometheusDiagnostic struct {
+	ID          string         `json:"id"` // identifier of the check; included in error messages when the query fails
+	Query       string         `json:"query"` // query string passed to Context.Query when the check is executed
+	Label       string         `json:"label"` // short title for the check
+	Description string         `json:"description"` // longer explanation of what the check determines
+	DocLink     string         `json:"docLink"` // documentation URL for the check; left empty for checks without one
+	Result      []*QueryResult `json:"result"` // results returned by the query; set only after a successful execution
+	Passed      bool           `json:"passed"` // true when the executed query returned no results
+}
+
+// executePrometheusDiagnosticQuery runs pd.Query through ctx, waits for the
+// result, and records the outcome on pd.
+//
+// On success, pd.Result is set to the (never nil) query results and pd.Passed
+// is true only when the query returned no results. On failure, pd is left
+// untouched and an error wrapping the underlying query error is returned.
+func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
+	resultCh := ctx.Query(pd.Query)
+	result, err := resultCh.Await()
+	if err != nil {
+		// Wrap with %w so callers can still inspect the underlying error
+		// with errors.Is / errors.As; the message text is unchanged.
+		return fmt.Errorf("prometheus diagnostic %s failed with error: %w", pd.ID, err)
+	}
+	if result == nil {
+		// Keep Result non-nil so it serializes as [] rather than null.
+		result = []*QueryResult{}
+	}
+	pd.Result = result
+	// The diagnostic queries are written so that they only return series
+	// when something is wrong; an empty result therefore means the check passed.
+	pd.Passed = len(result) == 0
+	return nil
+}

+ 2 - 2
pkg/prom/result.go

@@ -90,8 +90,8 @@ type QueryResults struct {
 // QueryResult contains a single result from a prometheus query. It's common
 // to refer to query results as a slice of QueryResult
 type QueryResult struct {
-	Metric map[string]interface{}
-	Values []*util.Vector
+	Metric map[string]interface{} `json:"metric"`
+	Values []*util.Vector         `json:"values"`
 }
 
 // NewQueryResults accepts the raw prometheus query result and returns an array of

+ 0 - 3
pkg/util/mapper/mapper.go

@@ -496,6 +496,3 @@ func (wom *writeOnlyMapper) SetDuration(key string, value time.Duration) error {
 // SetList joins values with delimiter into a single string and stores the
 // result under key using the underlying setter.
 func (wom *writeOnlyMapper) SetList(key string, values []string, delimiter string) error {
 	joined := strings.Join(values, delimiter)
 	return wom.setter.Set(key, joined)
 }
-
-
-

+ 1 - 1
pkg/util/timeutil/timeutil.go

@@ -237,4 +237,4 @@ func (jt *JobTicker) TickIn(d time.Duration) {
 			jt.ch <- time.Now()
 		}
 	}(d)
-}
+}

+ 45 - 48
pkg/util/timeutil/timeutil_test.go

@@ -7,7 +7,7 @@ import (
 
 func Test_DurationString(t *testing.T) {
 	testCases := map[string]struct {
-		duration time.Duration
+		duration         time.Duration
 		expectedDuration string
 	}{
 		"1a": {
@@ -15,36 +15,36 @@ func Test_DurationString(t *testing.T) {
 			expectedDuration: "",
 		},
 		"1b": {
-			duration:         24*time.Hour,
+			duration:         24 * time.Hour,
 			expectedDuration: "1d",
 		},
 		"1c": {
-			duration:         24*time.Hour+5*time.Minute,
+			duration:         24*time.Hour + 5*time.Minute,
 			expectedDuration: "1445m",
 		},
 		"1d": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "25h",
 		},
 		"1e": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "25h",
 		},
 		"1f": {
-			duration:         72*time.Hour,
+			duration:         72 * time.Hour,
 			expectedDuration: "3d",
 		},
 		"1g": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "25h",
 		},
 		"1h": {
-			duration:         24*time.Hour+time.Second,
+			duration:         24*time.Hour + time.Second,
 			expectedDuration: "86401s",
 		},
 		// Expect empty strings if durations are negative
 		"1i": {
-			duration:         -25*time.Hour,
+			duration:         -25 * time.Hour,
 			expectedDuration: "",
 		},
 	}
@@ -61,7 +61,7 @@ func Test_DurationString(t *testing.T) {
 
 func Test_DurationToPromOffsetString(t *testing.T) {
 	testCases := map[string]struct {
-		duration time.Duration
+		duration         time.Duration
 		expectedDuration string
 	}{
 		"1a": {
@@ -69,36 +69,36 @@ func Test_DurationToPromOffsetString(t *testing.T) {
 			expectedDuration: "",
 		},
 		"1b": {
-			duration:         24*time.Hour,
+			duration:         24 * time.Hour,
 			expectedDuration: "offset 1d",
 		},
 		"1c": {
-			duration:         24*time.Hour+5*time.Minute,
+			duration:         24*time.Hour + 5*time.Minute,
 			expectedDuration: "offset 1445m",
 		},
 		"1d": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "offset 25h",
 		},
 		"1e": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "offset 25h",
 		},
 		"1f": {
-			duration:         72*time.Hour,
+			duration:         72 * time.Hour,
 			expectedDuration: "offset 3d",
 		},
 		"1g": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "offset 25h",
 		},
 		"1h": {
-			duration:         24*time.Hour+time.Second,
+			duration:         24*time.Hour + time.Second,
 			expectedDuration: "offset 86401s",
 		},
 		// Expect empty strings if durations are negative
 		"1i": {
-			duration:         -25*time.Hour,
+			duration:         -25 * time.Hour,
 			expectedDuration: "",
 		},
 	}
@@ -115,7 +115,7 @@ func Test_DurationToPromOffsetString(t *testing.T) {
 
 func Test_FormatStoreResolution(t *testing.T) {
 	testCases := map[string]struct {
-		duration time.Duration
+		duration         time.Duration
 		expectedDuration string
 	}{
 		"1a": {
@@ -123,36 +123,36 @@ func Test_FormatStoreResolution(t *testing.T) {
 			expectedDuration: "0s",
 		},
 		"1b": {
-			duration:         24*time.Hour,
+			duration:         24 * time.Hour,
 			expectedDuration: "1d",
 		},
 		"1c": {
-			duration:         24*time.Hour+5*time.Minute,
+			duration:         24*time.Hour + 5*time.Minute,
 			expectedDuration: "1d",
 		},
 		"1d": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "1d",
 		},
 		"1e": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "1d",
 		},
 		"1f": {
-			duration:         72*time.Hour,
+			duration:         72 * time.Hour,
 			expectedDuration: "3d",
 		},
 		"1g": {
-			duration:         25*time.Hour,
+			duration:         25 * time.Hour,
 			expectedDuration: "1d",
 		},
 		"1h": {
-			duration:         24*time.Hour+time.Second,
+			duration:         24*time.Hour + time.Second,
 			expectedDuration: "1d",
 		},
 		// Expect empty strings if durations are negative
 		"1i": {
-			duration:         -25*time.Hour,
+			duration:         -25 * time.Hour,
 			expectedDuration: "-25h0m0s",
 		},
 	}
@@ -169,10 +169,10 @@ func Test_FormatStoreResolution(t *testing.T) {
 
 func Test_DurationOffsetStrings(t *testing.T) {
 	testCases := map[string]struct {
-		duration time.Duration
-		offset time.Duration
+		duration         time.Duration
+		offset           time.Duration
 		expectedDuration string
-		expectedOffset string
+		expectedOffset   string
 	}{
 		"1a": {
 			duration:         0,
@@ -181,51 +181,51 @@ func Test_DurationOffsetStrings(t *testing.T) {
 			expectedOffset:   "",
 		},
 		"1b": {
-			duration:         24*time.Hour,
+			duration:         24 * time.Hour,
 			offset:           0,
 			expectedDuration: "1d",
 			expectedOffset:   "",
 		},
 		"1c": {
-			duration:         24*time.Hour+5*time.Minute,
+			duration:         24*time.Hour + 5*time.Minute,
 			offset:           0,
 			expectedDuration: "1445m",
 			expectedOffset:   "",
 		},
 		"1d": {
-			duration:         25*time.Hour,
-			offset:           5*time.Minute,
+			duration:         25 * time.Hour,
+			offset:           5 * time.Minute,
 			expectedDuration: "25h",
 			expectedOffset:   "5m",
 		},
 		"1e": {
-			duration:         25*time.Hour,
-			offset:           60*time.Minute,
+			duration:         25 * time.Hour,
+			offset:           60 * time.Minute,
 			expectedDuration: "25h",
 			expectedOffset:   "1h",
 		},
 		"1f": {
-			duration:         72*time.Hour,
-			offset:           1440*time.Minute,
+			duration:         72 * time.Hour,
+			offset:           1440 * time.Minute,
 			expectedDuration: "3d",
 			expectedOffset:   "1d",
 		},
 		"1g": {
-			duration:         25*time.Hour,
-			offset:           1*time.Second,
+			duration:         25 * time.Hour,
+			offset:           1 * time.Second,
 			expectedDuration: "25h",
 			expectedOffset:   "1s",
 		},
 		"1h": {
-			duration:         24*time.Hour+time.Second,
-			offset:           1*time.Second,
+			duration:         24*time.Hour + time.Second,
+			offset:           1 * time.Second,
 			expectedDuration: "86401s",
 			expectedOffset:   "1s",
 		},
 		// Expect empty strings if durations are negative
 		"1i": {
-			duration:         -25*time.Hour,
-			offset:           -1*time.Second,
+			duration:         -25 * time.Hour,
+			offset:           -1 * time.Second,
 			expectedDuration: "",
 			expectedOffset:   "",
 		},
@@ -233,7 +233,7 @@ func Test_DurationOffsetStrings(t *testing.T) {
 
 	for name, test := range testCases {
 		t.Run(name, func(t *testing.T) {
-			dur, off:= DurationOffsetStrings(test.duration, test.offset)
+			dur, off := DurationOffsetStrings(test.duration, test.offset)
 			if dur != test.expectedDuration || off != test.expectedOffset {
 				t.Fatalf("DurationOffsetStrings: exp (%s %s); act (%s, %s)", test.expectedDuration, test.expectedOffset, dur, off)
 			}
@@ -322,8 +322,6 @@ func Test_CleanDurationString(t *testing.T) {
 			input:    "oqwd3dk5hk",
 			expected: "oqwd3dk5hk",
 		},
-
-
 	}
 	for name, test := range testCases {
 		t.Run(name, func(t *testing.T) {
@@ -368,7 +366,6 @@ func Test_FormatDurationStringDaysToHours(t *testing.T) {
 			input:    "oqwd3dk5hk",
 			expected: "oqwd3dk5hk",
 		},
-
 	}
 	for name, test := range testCases {
 		t.Run(name, func(t *testing.T) {