
Add prometheus metrics endpoint

Sean Holcomb 4 years ago
parent
commit
5feb0053a2
4 changed files with 141 additions and 2 deletions
  1. pkg/costmodel/router.go (+30 -0)
  2. pkg/prom/contextnames.go (+3 -0)
  3. pkg/prom/diagnostics.go (+106 -0)
  4. pkg/prom/result.go (+2 -2)

+ 30 - 0
pkg/costmodel/router.go

@@ -884,6 +884,35 @@ func (a *Accesses) GetPrometheusQueueState(w http.ResponseWriter, _ *http.Reques
 	w.Write(WrapData(result, nil))
 }
 
+// GetPrometheusMetrics retrieves availability of Prometheus and Thanos metrics
+func (a *Accesses) GetPrometheusMetrics(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
+	w.Header().Set("Content-Type", "application/json")
+	w.Header().Set("Access-Control-Allow-Origin", "*")
+
+	promMetrics, err := prom.GetPrometheusMetrics(a.PrometheusClient, "")
+	if err != nil {
+		w.Write(WrapData(nil, err))
+		return
+	}
+
+	result := map[string][]*prom.PrometheusDiagnostic{
+		"prometheus": promMetrics,
+	}
+
+	if thanos.IsEnabled() {
+		thanosMetrics, err := prom.GetPrometheusMetrics(a.ThanosClient, thanos.QueryOffset())
+		if err != nil {
+			log.Warningf("Error getting Thanos metrics: %s", err)
+		} else {
+			result["thanos"] = thanosMetrics
+		}
+	}
+
+	w.Write(WrapData(result, nil))
+}
+
 // Creates a new ClusterManager instance using a boltdb storage. If that fails,
 // then we fall back to a memory-only storage.
 func newClusterManager() *cm.ClusterManager {
@@ -1243,6 +1272,7 @@ func Initialize(additionalConfigWatchers ...ConfigWatchers) *Accesses {
 
 	// diagnostics
 	a.Router.GET("/diagnostics/requestQueue", a.GetPrometheusQueueState)
+	a.Router.GET("/diagnostics/prometheusMetrics", a.GetPrometheusMetrics)
 
 	// cluster manager endpoints
 	a.Router.GET("/clusters", managerEndpoints.GetAllClusters)
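
For reference, the new route can be exercised with a plain HTTP GET once the service is running. A minimal sketch follows; the port (9003) and the exact response envelope produced by WrapData are assumptions, not part of this change:

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumed address of the cost-model service; adjust to your deployment.
	resp, err := http.Get("http://localhost:9003/diagnostics/prometheusMetrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The body is expected to be a JSON object keyed by "prometheus" (and
	// "thanos" when Thanos is enabled), each entry holding a list of
	// PrometheusDiagnostic results.
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}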

+ 3 - 0
pkg/prom/contextnames.go

@@ -21,4 +21,7 @@ const (
 
 	// FrontendContextName is the name we assign queries proxied from the frontend [metadata]
 	FrontendContextName = "frontend"
+
+	// DiagnosticContextName is the name we assign queries that check the state of the prometheus connection
+	DiagnosticContextName = "diagnostic"
 )

+ 106 - 0
pkg/prom/diagnostics.go

@@ -66,3 +66,109 @@ func LogPrometheusClientState(client prometheus.Client) {
 		log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
 	}
 }
+
+// GetPrometheusMetrics returns the state of each Prometheus metric used by Kubecost, queried via the provided client
+func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
+	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	result := []*PrometheusDiagnostic{
+		{
+			ID:          "cadvisorMetric",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
+			Label:       "cAdvisor metrics available",
+			Description: "Determine if cAdvisor metrics are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmMetric",
+			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m]  %s)`, offset),
+			Label:       "Kube-state-metrics available",
+			Description: "Determine if metrics from kube-state-metrics are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
+		},
+		{
+			ID:          "kubecostMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m]  %s)`, offset),
+			Label:       "Kubecost metrics available",
+			Description: "Determine if metrics from Kubecost are available during the last 5 minutes.",
+		},
+		{
+			ID:          "neMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
+			Label:       "Node-exporter metrics available",
+			Description: "Determine if metrics from node-exporter are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
+		},
+		{
+			ID:          "recordingMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m]  %s)`, offset),
+			Label:       "Recording rules are available",
+			Description: "Determine if metrics defined by Kubecost recording rules are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#recording-rules-are-available", docs),
+		},
+		{
+			ID:          "cadvisorLabel",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container_name!="",pod_name!="",instance!~"10.*|192.*|172.*"}[5m]  %s)`, offset),
+			Label:       "Expected cAdvisor labels available",
+			Description: "Determine if expected cAdvisor labels are present during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmVersion",
+			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m]  %s)`, offset),
+			Label:       "Expected kube-state-metrics version found",
+			Description: "Determine if metrics from the required kube-state-metrics version are present during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
+		},
+		{
+			ID:          "scrapeInterval",
+			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m]  %s)`, offset),
+			Label:       "Expected Prometheus self-scrape metrics available",
+			Description: "Determine if Prometheus has its own self-scraped metrics during the last 5 minutes.",
+		},
+		{
+			ID:          "cpuThrottling",
+			Query:       `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
+		/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
+			Label:       "Kubecost is not CPU throttled",
+			Description: "Kubecost loading slowly? A Kubecost component might be CPU throttled.",
+		},
+	}
+
+	for _, pd := range result {
+		err := pd.executePrometheusDiagnosticQuery(ctx)
+		if err != nil {
+			log.Errorf("%s", err)
+		}
+	}
+
+	return result, nil
+}
+
+// PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
+type PrometheusDiagnostic struct {
+	ID          string         `json:"id"`
+	Query       string         `json:"query"`
+	Label       string         `json:"label"`
+	Description string         `json:"description"`
+	DocLink     string         `json:"docLink"`
+	Result      []*QueryResult `json:"result"`
+	Passed      bool           `json:"passed"`
+}
+
+// executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
+func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
+	resultCh := ctx.Query(pd.Query)
+	result, err := resultCh.Await()
+	if err != nil {
+		return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
+	}
+	if result == nil {
+		result = []*QueryResult{}
+	}
+	pd.Result = result
+	pd.Passed = len(result) == 0
+	return nil
+}
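
A note on the pass/fail convention: absent_over_time returns a series only when the target metric is missing, so an empty result set means the metric was found during the window and Passed is set to true (the cpuThrottling check follows the same rule, passing when no throttled series crosses the threshold). Below is a minimal sketch of calling the helper directly, written as if it lived alongside diagnostics.go in package prom; logDiagnostics is a hypothetical wrapper and the client construction is assumed to match the cost-model's own Prometheus client:

// logDiagnostics is a hypothetical helper illustrating direct use of
// GetPrometheusMetrics from within package prom.
func logDiagnostics(client prometheus.Client) {
	diags, err := GetPrometheusMetrics(client, "")
	if err != nil {
		log.Errorf("diagnostics failed: %s", err)
		return
	}
	for _, d := range diags {
		// Passed is true when the diagnostic query returned no series.
		log.Infof("%s (%s): passed=%t", d.Label, d.ID, d.Passed)
	}
}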

+ 2 - 2
pkg/prom/result.go

@@ -90,8 +90,8 @@ type QueryResults struct {
 // QueryResult contains a single result from a prometheus query. It's common
 // to refer to query results as a slice of QueryResult
 type QueryResult struct {
-	Metric map[string]interface{}
-	Values []*util.Vector
+	Metric map[string]interface{} `json:"metric"`
+	Values []*util.Vector         `json:"values"`
 }
 
 // NewQueryResults accepts the raw prometheus query result and returns an array of
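
The added struct tags only change serialization: the diagnostics payload now emits lowercase "metric" and "values" keys instead of Go's default exported field names. A small sketch of the effect (import paths are assumed from the repository layout, and the field contents are placeholders):

package main

import (
	"encoding/json"
	"fmt"

	"github.com/kubecost/cost-model/pkg/prom"
	"github.com/kubecost/cost-model/pkg/util"
)

func main() {
	qr := &prom.QueryResult{
		Metric: map[string]interface{}{"namespace": "kube-system"},
		Values: []*util.Vector{},
	}
	b, _ := json.Marshal(qr)
	// Prints {"metric":{"namespace":"kube-system"},"values":[]} rather than
	// the default {"Metric":...,"Values":...}.
	fmt.Println(string(b))
}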