
Add prometheus metrics endpoint

Sean Holcomb 4 years ago
parent
commit
5feb0053a2
4 changed files with 141 additions and 2 deletions
  1. pkg/costmodel/router.go (+30 -0)
  2. pkg/prom/contextnames.go (+3 -0)
  3. pkg/prom/diagnostics.go (+106 -0)
  4. pkg/prom/result.go (+2 -2)

+ 30 - 0
pkg/costmodel/router.go

@@ -884,6 +884,35 @@ func (a *Accesses) GetPrometheusQueueState(w http.ResponseWriter, _ *http.Reques
 	w.Write(WrapData(result, nil))
 }
 
+// GetPrometheusMetrics retrieves availability of Prometheus and Thanos metrics
+func (a *Accesses) GetPrometheusMetrics(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
+	w.Header().Set("Content-Type", "application/json")
+	w.Header().Set("Access-Control-Allow-Origin", "*")
+
+	promMetrics, err := prom.GetPrometheusMetrics(a.PrometheusClient, "")
+	if err != nil {
+		w.Write(WrapData(nil, err))
+		return
+	}
+
+	result := map[string][]*prom.PrometheusDiagnostic{
+		"prometheus": promMetrics,
+	}
+
+	if thanos.IsEnabled() {
+		thanosMetrics, err := prom.GetPrometheusMetrics(a.ThanosClient, thanos.QueryOffset())
+		if err != nil {
+			log.Warningf("Error getting Thanos metrics: %s", err)
+		} else {
+			result["thanos"] = thanosMetrics
+		}
+	}
+
+	w.Write(WrapData(result, nil))
+}
+
 // Creates a new ClusterManager instance using a boltdb storage. If that fails,
 // then we fall back to a memory-only storage.
 func newClusterManager() *cm.ClusterManager {
@@ -1243,6 +1272,7 @@ func Initialize(additionalConfigWatchers ...ConfigWatchers) *Accesses {
 
 	// diagnostics
 	a.Router.GET("/diagnostics/requestQueue", a.GetPrometheusQueueState)
+	a.Router.GET("/diagnostics/prometheusMetrics", a.GetPrometheusMetrics)
 
 	// cluster manager endpoints
 	a.Router.GET("/clusters", managerEndpoints.GetAllClusters)
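
For reference, the new route can be exercised with a plain HTTP GET once the service is running. A minimal sketch follows; the port (9003) and the exact response envelope produced by WrapData are assumptions, not part of this change:

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumed address of the cost-model service; adjust to your deployment.
	resp, err := http.Get("http://localhost:9003/diagnostics/prometheusMetrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The body is expected to be a JSON object keyed by "prometheus" (and
	// "thanos" when Thanos is enabled), each entry holding a list of
	// PrometheusDiagnostic results.
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}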

+ 3 - 0
pkg/prom/contextnames.go

@@ -21,4 +21,7 @@ const (
 
 	// FrontendContextName is the name we assign queries proxied from the frontend [metadata]
 	FrontendContextName = "frontend"
+
+	// DiagnosticContextName is the name we assign queries that check the state of the prometheus connection
+	DiagnosticContextName = "diagnostic"
 )

+ 106 - 0
pkg/prom/diagnostics.go

@@ -66,3 +66,109 @@ func LogPrometheusClientState(client prometheus.Client) {
 		log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
 	}
 }
+
+// GetPrometheusMetrics returns the state of each Prometheus metric used by Kubecost, queried via the provided client
+func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
+	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	result := []*PrometheusDiagnostic{
+		{
+			ID:          "cadvisorMetric",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
+			Label:       "cAdvisor metrics available",
+			Description: "Determine if cAdvisor metrics are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmMetric",
+			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m]  %s)`, offset),
+			Label:       "Kube-state-metrics available",
+			Description: "Determine if metrics from kube-state-metrics are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
+		},
+		{
+			ID:          "kubecostMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m]  %s)`, offset),
+			Label:       "Kubecost metrics available",
+			Description: "Determine if metrics from Kubecost are available during the last 5 minutes.",
+		},
+		{
+			ID:          "neMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
+			Label:       "Node-exporter metrics available",
+			Description: "Determine if metrics from node-exporter are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
+		},
+		{
+			ID:          "recordingMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
+			Label:       "Recording rules are available",
+			Description: "Determine if metrics defined by Kubecost recording rules are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#recording-rules-are-available", docs),
+		},
+		{
+			ID:          "cadvisorLabel",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container_name!="",pod_name!="",instance!~"10.*|192.*|172.*"}[5m]  %s)`, offset),
+			Label:       "Expected cAdvisor labels available",
+			Description: "Determine if expected cAdvisor labels are present during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmVersion",
+			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m]  %s)`, offset),
+			Label:       "Expected kube-state-metrics version found",
+			Description: "Determine if metrics from the required kube-state-metrics version are present during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
+		},
+		{
+			ID:          "scrapeInterval",
+			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m]  %s)`, offset),
+			Label:       "Expected Prometheus self-scrape metrics available",
+			Description: "Determine if Prometheus has its own self-scraped metrics during the last 5 minutes.",
+		},
+		{
+			ID:          "cpuThrottling",
+			Query:       `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
+		/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
+			Label:       "Kubecost is not CPU throttled",
+			Description: "Kubecost loading slowly? A Kubecost component might be CPU throttled.",
+		},
+	}
+
+	for _, pd := range result {
+		err := pd.executePrometheusDiagnosticQuery(ctx)
+		if err != nil {
+			log.Errorf("%s", err)
+		}
+	}
+
+	return result, nil
+}
+
+// PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
+type PrometheusDiagnostic struct {
+	ID          string         `json:"id"`
+	Query       string         `json:"query"`
+	Label       string         `json:"label"`
+	Description string         `json:"description"`
+	DocLink     string         `json:"docLink"`
+	Result      []*QueryResult `json:"result"`
+	Passed      bool           `json:"passed"`
+}
+
+// executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
+func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
+	resultCh := ctx.Query(pd.Query)
+	result, err := resultCh.Await()
+	if err != nil {
+		return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
+	}
+	if result == nil {
+		result = []*QueryResult{}
+	}
+	pd.Result = result
+	pd.Passed = len(result) == 0
+	return nil
+}
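
A note on the pass/fail convention: absent_over_time returns a series only when the target metric is missing, so an empty result set means the metric was found during the window and Passed is set to true (the cpuThrottling check follows the same rule, passing when no throttled series crosses the threshold). Below is a minimal sketch of calling the helper directly, written as if it lived alongside diagnostics.go in package prom; logDiagnostics is a hypothetical wrapper and the client construction is assumed to match the cost-model's own Prometheus client:

// logDiagnostics is a hypothetical helper illustrating direct use of
// GetPrometheusMetrics from within package prom.
func logDiagnostics(client prometheus.Client) {
	diags, err := GetPrometheusMetrics(client, "")
	if err != nil {
		log.Errorf("diagnostics failed: %s", err)
		return
	}
	for _, d := range diags {
		// Passed is true when the diagnostic query returned no series.
		log.Infof("%s (%s): passed=%t", d.Label, d.ID, d.Passed)
	}
}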

+ 2 - 2
pkg/prom/result.go

@@ -90,8 +90,8 @@ type QueryResults struct {
 // QueryResult contains a single result from a prometheus query. It's common
 // to refer to query results as a slice of QueryResult
 type QueryResult struct {
-	Metric map[string]interface{}
-	Values []*util.Vector
+	Metric map[string]interface{} `json:"metric"`
+	Values []*util.Vector         `json:"values"`
 }
 
 // NewQueryResults accepts the raw prometheus query result and returns an array of
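
The added struct tags only change serialization: the diagnostics payload now emits lowercase "metric" and "values" keys instead of Go's default exported field names. A small sketch of the effect (import paths are assumed from the repository layout, and the field contents are placeholders):

package main

import (
	"encoding/json"
	"fmt"

	"github.com/kubecost/cost-model/pkg/prom"
	"github.com/kubecost/cost-model/pkg/util"
)

func main() {
	qr := &prom.QueryResult{
		Metric: map[string]interface{}{"namespace": "kube-system"},
		Values: []*util.Vector{},
	}
	b, _ := json.Marshal(qr)
	// Prints {"metric":{"namespace":"kube-system"},"values":[]} rather than
	// the default {"Metric":...,"Values":...}.
	fmt.Println(string(b))
}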