@@ -66,3 +66,109 @@ func LogPrometheusClientState(client prometheus.Client) {
log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
}
}
+
+// GetPrometheusMetrics returns the state of each Prometheus metric that Kubecost relies on, queried via the provided client
+func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
+	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	result := []*PrometheusDiagnostic{
+		{
+			ID:          "cadvisorMetric",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
+			Label:       "cAdvisor metrics available",
+			Description: "Determine if cAdvisor metrics are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmMetric",
+			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
+			Label:       "Kube-state-metrics available",
+			Description: "Determine if metrics from kube-state-metrics are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
+		},
+		{
+			ID:          "kubecostMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
+			Label:       "Kubecost metrics available",
+			Description: "Determine if metrics from Kubecost are available during the last 5 minutes.",
+		},
+		{
+			ID:          "neMetric",
+			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
+			Label:       "Node-exporter metrics available",
+			Description: "Determine if metrics from node-exporter are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
+		},
+		{
+			ID: "recordingMetric",
+			// Query a metric produced by Kubecost's recording rules; the previous query,
+			// node_cpu_seconds_total, duplicated the node-exporter check above and would
+			// not exercise the recording rules at all.
+			Query:       fmt.Sprintf(`absent_over_time(kubecost_cluster_memory_working_set_bytes[5m] %s)`, offset),
+			Label:       "Recording rules are available",
+			Description: "Determine if metrics defined by Kubecost recording rules are available during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#recording-rules-are-available", docs),
+		},
+		{
+			ID:          "cadvisorLabel",
+			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container_name!="",pod_name!="",instance!~"10.*|192.*|172.*"}[5m] %s)`, offset),
+			Label:       "Expected cAdvisor labels available",
+			Description: "Determine if the expected cAdvisor labels are present during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
+		},
+		{
+			ID:          "ksmVersion",
+			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
+			Label:       "Expected kube-state-metrics version found",
+			Description: "Determine if metrics from the required kube-state-metrics version are present during the last 5 minutes.",
+			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
+		},
+		{
+			ID:          "scrapeInterval",
+			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
+			Label:       "Expected Prometheus self-scrape metrics available",
+			Description: "Determine if Prometheus has its own self-scraped metrics during the last 5 minutes.",
+		},
+		{
+			ID: "cpuThrottling",
+			// Group by the same label names used in the selector (container, pod); the
+			// previous by-clause grouped on the legacy cAdvisor names container_name and
+			// pod_name, which do not exist on series matched by container="cost-model".
+			Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container, pod, namespace)
+				/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container, pod, namespace) > 0.2`,
+			Label:       "Kubecost is not CPU throttled",
+			Description: "Kubecost loading slowly? A Kubecost component might be CPU throttled.",
+		},
+	}
+
+	for _, pd := range result {
+		err := pd.executePrometheusDiagnosticQuery(ctx)
+		if err != nil {
+			// Log and continue: one failed diagnostic should not abort the rest.
+			log.Errorf("%s", err)
+		}
+	}
+
+	return result, nil
+}
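+
+// Example usage (illustrative; assumes a configured prometheus.Client named
+// promClient and a PromQL offset modifier such as "offset 5m", or "" for none):
+//
+//	diagnostics, err := GetPrometheusMetrics(promClient, "offset 5m")
+//	if err != nil {
+//		log.Errorf("diagnostics failed: %s", err)
+//	}
+//	for _, pd := range diagnostics {
+//		log.Infof("%s passed: %t", pd.Label, pd.Passed)
+//	}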
+
+// PrometheusDiagnostic holds information about a metric and the query used to verify that it is functional
+type PrometheusDiagnostic struct {
+	ID          string         `json:"id"`
+	Query       string         `json:"query"`
+	Label       string         `json:"label"`
+	Description string         `json:"description"`
+	DocLink     string         `json:"docLink"`
+	Result      []*QueryResult `json:"result"`
+	Passed      bool           `json:"passed"`
+}
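+
+// A PrometheusDiagnostic serializes to JSON roughly as follows (illustrative
+// values for a passing check, with an empty offset rendered into the query):
+//
+//	{
+//	  "id": "kubecostMetric",
+//	  "query": "absent_over_time(node_cpu_hourly_cost[5m] )",
+//	  "label": "Kubecost metrics available",
+//	  "description": "Determine if metrics from Kubecost are available during the last 5 minutes.",
+//	  "docLink": "",
+//	  "result": [],
+//	  "passed": true
+//	}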
+
+// executePrometheusDiagnosticQuery executes a PrometheusDiagnostic's query using the given context
+func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
+	resultCh := ctx.Query(pd.Query)
+	result, err := resultCh.Await()
+	if err != nil {
+		return fmt.Errorf("prometheus diagnostic %s failed with error: %w", pd.ID, err)
+	}
+	if result == nil {
+		result = []*QueryResult{}
+	}
+	pd.Result = result
+	// Diagnostic queries are written so that they only return data when something
+	// is wrong (a metric is absent, or throttling exceeds its threshold), so an
+	// empty result set means the check passed.
+	pd.Passed = len(result) == 0
+	return nil
+}
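+
+// For reference, a rendered absent_over_time diagnostic with a non-empty offset
+// looks like the following (illustrative). absent_over_time returns 1 only when
+// the series has no samples in the window, so any result indicates a problem:
+//
+//	absent_over_time(container_cpu_usage_seconds_total[5m] offset 5m)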