@@ -9,6 +9,95 @@ import (
 	prometheus "github.com/prometheus/client_golang/api"
 )

+// Prometheus Metric Diagnostic IDs
+const (
+	// CAdvisorDiagnosticMetricID is the identifier for the metric used to determine if cAdvisor is being scraped.
+	CAdvisorDiagnosticMetricID = "cadvisorMetric"
+
+	// CAdvisorLabelDiagnosticMetricID is the identifier for the metric used to determine if cAdvisor labels are correct.
+	CAdvisorLabelDiagnosticMetricID = "cadvisorLabel"
+
+	// KSMDiagnosticMetricID is the identifier for the metric used to determine if KSM metrics are being scraped.
+	KSMDiagnosticMetricID = "ksmMetric"
+
+	// KSMVersionDiagnosticMetricID is the identifier for the metric used to determine if the KSM version is correct.
+	KSMVersionDiagnosticMetricID = "ksmVersion"
+
+	// KubecostDiagnosticMetricID is the identifier for the metric used to determine if Kubecost metrics are being scraped.
+	KubecostDiagnosticMetricID = "kubecostMetric"
+
+	// NodeExporterDiagnosticMetricID is the identifier for the metric used to determine if NodeExporter metrics are being scraped.
+	NodeExporterDiagnosticMetricID = "neMetric"
+
+	// ScrapeIntervalDiagnosticMetricID is the identifier for the metric used to determine if Prometheus has its own self-scraped
+	// metrics.
+	ScrapeIntervalDiagnosticMetricID = "scrapeInterval"
+
+	// CPUThrottlingDiagnosticMetricID is the identifier for the metric used to determine if CPU throttling is being applied to the
+	// cost-model container.
+	CPUThrottlingDiagnosticMetricID = "cpuThrottling"
+)
+
+// DocumentationBaseURL is the base URL for the Kubecost diagnostics documentation.
+const DocumentationBaseURL = "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+
+// diagnosticDefinitions maps each diagnostic metric ID to the definition used to run
+// Prometheus metric diagnostics.
+var diagnosticDefinitions = map[string]*diagnosticDefinition{
+	CAdvisorDiagnosticMetricID: {
+		ID:          CAdvisorDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(container_cpu_usage_seconds_total[5m] %s)`,
+		Label:       "cAdvisor metrics available",
+		Description: "Determine if cAdvisor metrics are available during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
+	},
+	KSMDiagnosticMetricID: {
+		ID:          KSMDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`,
+		Label:       "Kube-state-metrics available",
+		Description: "Determine if metrics from kube-state-metrics are available during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", DocumentationBaseURL),
+	},
+	KubecostDiagnosticMetricID: {
+		ID:          KubecostDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(node_cpu_hourly_cost[5m] %s)`,
+		Label:       "Kubecost metrics available",
+		Description: "Determine if metrics from Kubecost are available during the last 5 minutes.",
+	},
+	NodeExporterDiagnosticMetricID: {
+		ID:          NodeExporterDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(node_cpu_seconds_total[5m] %s)`,
+		Label:       "Node-exporter metrics available",
+		Description: "Determine if metrics from node-exporter are available during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", DocumentationBaseURL),
+	},
+	CAdvisorLabelDiagnosticMetricID: {
+		ID:          CAdvisorLabelDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`,
+		Label:       "Expected cAdvisor labels available",
+		Description: "Determine if the expected cAdvisor labels are present during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
+	},
+	KSMVersionDiagnosticMetricID: {
+		ID:          KSMVersionDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`,
+		Label:       "Expected kube-state-metrics version found",
+		Description: "Determine if metrics from the required kube-state-metrics version are present during the last 5 minutes.",
+		DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", DocumentationBaseURL),
+	},
+	ScrapeIntervalDiagnosticMetricID: {
+		ID:          ScrapeIntervalDiagnosticMetricID,
+		QueryFmt:    `absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`,
+		Label:       "Expected Prometheus self-scrape metrics available",
+		Description: "Determine if Prometheus has its own self-scraped metrics during the last 5 minutes.",
+	},
+	CPUThrottlingDiagnosticMetricID: {
+		ID: CPUThrottlingDiagnosticMetricID,
+		QueryFmt: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace)
+			/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace) > 0.2`,
+		Label:       "Kubecost is not CPU throttled",
+		Description: "Kubecost loading slowly? A Kubecost component might be CPU throttled.",
+	},
+}
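+
+// For illustration only: assuming a hypothetical offset argument of "offset 10m",
+// the cadvisorMetric QueryFmt above expands (via NewDiagnostic, defined below) to:
+//
+//	absent_over_time(container_cpu_usage_seconds_total[5m] offset 10m)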
+
 // QueuedPromRequest is a representation of a request waiting to be sent by the prometheus
 // client.
 type QueuedPromRequest struct {
@@ -66,75 +155,89 @@ func LogPrometheusClientState(client prometheus.Client) {
 }

 // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
-func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
-	docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
+func GetPrometheusMetrics(client prometheus.Client, offset string) PrometheusDiagnostics {
 	ctx := NewNamedContext(client, DiagnosticContextName)

-	result := []*PrometheusDiagnostic{
-		{
-			ID:          "cadvisorMetric",
-			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
-			Label:       "cAdvsior metrics available",
-			Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
-		},
-		{
-			ID:          "ksmMetric",
-			Query:       fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
-			Label:       "Kube-state-metrics available",
-			Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
-		},
-		{
-			ID:          "kubecostMetric",
-			Query:       fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
-			Label:       "Kubecost metrics available",
-			Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
-		},
-		{
-			ID:          "neMetric",
-			Query:       fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
-			Label:       "Node-exporter metrics available",
-			Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#node-exporter-metrics-available", docs),
-		},
-		{
-			ID:          "cadvisorLabel",
-			Query:       fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`, offset),
-			Label:       "Expected cAdvsior labels available",
-			Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#cadvisor-metrics-available", docs),
-		},
-		{
-			ID:          "ksmVersion",
-			Query:       fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
-			Label:       "Expected kube-state-metrics version found",
-			Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
-			DocLink:     fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
-		},
-		{
-			ID:          "scrapeInterval",
-			Query:       fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
-			Label:       "Expected Prometheus self-scrape metrics available",
-			Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
-		},
-		{
-			ID: "cpuThrottling",
-			Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
-				/ avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
-			Label:       "Kubecost is not CPU throttled",
-			Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
-		},
-	}
-
-	for _, pd := range result {
+	var result []*PrometheusDiagnostic
+	for _, definition := range diagnosticDefinitions {
+		pd := definition.NewDiagnostic(offset)
 		err := pd.executePrometheusDiagnosticQuery(ctx)
+
+		// log the error, append to the results anyway, and continue
 		if err != nil {
 			log.Errorf(err.Error())
 		}
+		result = append(result, pd)
+	}
+
+	return result
+}
+
+// GetPrometheusMetricsByID returns the state of the specific Prometheus metrics identified by ids.
+func GetPrometheusMetricsByID(ids []string, client prometheus.Client, offset string) PrometheusDiagnostics {
+	ctx := NewNamedContext(client, DiagnosticContextName)
+
+	var result []*PrometheusDiagnostic
+	for _, id := range ids {
+		if definition, ok := diagnosticDefinitions[id]; ok {
+			pd := definition.NewDiagnostic(offset)
+			err := pd.executePrometheusDiagnosticQuery(ctx)
+
+			// log the error, append to the results anyway, and continue
+			if err != nil {
+				log.Errorf(err.Error())
+			}
+			result = append(result, pd)
+		} else {
+			log.Warnf("Failed to find diagnostic definition for id: %s", id)
+		}
+	}
+
+	return result
+}
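+
+// Hypothetical usage sketch, not part of this change: check only the cAdvisor
+// diagnostics, assuming the offset argument takes the PromQL form "offset 10m".
+//
+//	ids := []string{CAdvisorDiagnosticMetricID, CAdvisorLabelDiagnosticMetricID}
+//	diags := GetPrometheusMetricsByID(ids, client, "offset 10m")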
+
+// PrometheusDiagnostics is a collection of PrometheusDiagnostic instances with helper methods.
+type PrometheusDiagnostics []*PrometheusDiagnostic
+
+// HasFailure returns true if any of the diagnostic tests did not pass.
+func (pd PrometheusDiagnostics) HasFailure() bool {
+	for _, p := range pd {
+		if !p.Passed {
+			return true
+		}
 	}

-	return result, nil
+	return false
+}
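+
+// Hypothetical usage sketch, not part of this change: run every diagnostic and
+// surface a warning when the aggregate result contains a failure.
+//
+//	diags := GetPrometheusMetrics(client, offset)
+//	if diags.HasFailure() {
+//		log.Warnf("at least one Prometheus diagnostic did not pass")
+//	}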
+
+// diagnosticDefinition defines a diagnostic and is used to create new
+// PrometheusDiagnostic instances from its fields.
+type diagnosticDefinition struct {
+	ID          string
+	QueryFmt    string
+	Label       string
+	Description string
+	DocLink     string
+}
+
+// NewDiagnostic creates a new PrometheusDiagnostic from the definition, substituting
+// the provided offset into the query format.
+func (pdd *diagnosticDefinition) NewDiagnostic(offset string) *PrometheusDiagnostic {
+	// FIXME: Is there a reasonable way to get the total number of replacements required in the query?
+	// FIXME: All of the other queries require a single offset replacement, but CPUThrottling requires two.
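+	//
+	// One possible answer to the FIXME above (an untested sketch, not part of this
+	// change): count the %s verbs in QueryFmt and build the argument list
+	// dynamically, which would require importing "strings":
+	//
+	//	n := strings.Count(pdd.QueryFmt, "%s")
+	//	args := make([]interface{}, n)
+	//	for i := range args {
+	//		args[i] = offset
+	//	}
+	//	query := fmt.Sprintf(pdd.QueryFmt, args...)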
+	var query string
+	if pdd.ID == CPUThrottlingDiagnosticMetricID {
+		query = fmt.Sprintf(pdd.QueryFmt, offset, offset)
+	} else {
+		query = fmt.Sprintf(pdd.QueryFmt, offset)
+	}
+
+	return &PrometheusDiagnostic{
+		ID:          pdd.ID,
+		Query:       query,
+		Label:       pdd.Label,
+		Description: pdd.Description,
+		DocLink:     pdd.DocLink,
+	}
 }

 // PrometheusDiagnostic holds information about a metric and the query to ensure it is functional