diagnostics.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/kubecost/cost-model/pkg/env"
  6. "github.com/kubecost/cost-model/pkg/log"
  7. prometheus "github.com/prometheus/client_golang/api"
  8. )
  // QueuedPromRequest describes a single request that is waiting in the queue
  // to be sent by the prometheus client.
  type QueuedPromRequest struct {
  	// Context is the name of the query context the request was issued under.
  	Context string `json:"context"`

  	// Query is the query text of the waiting request.
  	Query string `json:"query"`

  	// QueueTime is the number of milliseconds elapsed since the request's
  	// start time, as populated by GetPrometheusQueueState.
  	QueueTime int64 `json:"queueTime"`
  }
  16. // PrometheusQueueState contains diagnostic information concerning the state of the prometheus request
  17. // queue
  18. type PrometheusQueueState struct {
  19. QueuedRequests []*QueuedPromRequest `json:"queuedRequests"`
  20. OutboundRequests int `json:"outboundRequests"`
  21. TotalRequests int `json:"totalRequests"`
  22. MaxQueryConcurrency int `json:"maxQueryConcurrency"`
  23. }
  24. // GetPrometheusQueueState is a diagnostic function that probes the prometheus request queue and gathers
  25. // query, context, and queue statistics.
  26. func GetPrometheusQueueState(client prometheus.Client) (*PrometheusQueueState, error) {
  27. rlpc, ok := client.(*RateLimitedPrometheusClient)
  28. if !ok {
  29. return nil, fmt.Errorf("Failed to get prometheus queue state for the provided client. Must be of type RateLimitedPrometheusClient.")
  30. }
  31. outbound := rlpc.TotalOutboundRequests()
  32. requests := []*QueuedPromRequest{}
  33. rlpc.queue.Each(func(_ int, req *workRequest) {
  34. requests = append(requests, &QueuedPromRequest{
  35. Context: req.contextName,
  36. Query: req.query,
  37. QueueTime: time.Since(req.start).Milliseconds(),
  38. })
  39. })
  40. return &PrometheusQueueState{
  41. QueuedRequests: requests,
  42. OutboundRequests: outbound,
  43. TotalRequests: outbound + len(requests),
  44. MaxQueryConcurrency: env.GetMaxQueryConcurrency(),
  45. }, nil
  46. }
  47. // LogPrometheusClientState logs the current state, with respect to outbound requests, if that
  48. // information is available.
  49. func LogPrometheusClientState(client prometheus.Client) {
  50. if rc, ok := client.(requestCounter); ok {
  51. queued := rc.TotalQueuedRequests()
  52. outbound := rc.TotalOutboundRequests()
  53. total := queued + outbound
  54. log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
  55. }
  56. }
  57. // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
  58. func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
  59. docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
  60. ctx := NewNamedContext(client, DiagnosticContextName)
  61. result := []*PrometheusDiagnostic{
  62. {
  63. ID: "cadvisorMetric",
  64. Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
  65. Label: "cAdvsior metrics available",
  66. Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
  67. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
  68. },
  69. {
  70. ID: "ksmMetric",
  71. Query: fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
  72. Label: "Kube-state-metrics available",
  73. Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
  74. DocLink: fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
  75. },
  76. {
  77. ID: "kubecostMetric",
  78. Query: fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
  79. Label: "Kubecost metrics available",
  80. Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
  81. },
  82. {
  83. ID: "neMetric",
  84. Query: fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
  85. Label: "Node-exporter metrics available",
  86. Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
  87. DocLink: fmt.Sprintf("%s#node-exporter-metrics-available", docs),
  88. },
  89. {
  90. ID: "cadvisorLabel",
  91. Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`, offset),
  92. Label: "Expected cAdvsior labels available",
  93. Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
  94. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
  95. },
  96. {
  97. ID: "ksmVersion",
  98. Query: fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
  99. Label: "Expected kube-state-metrics version found",
  100. Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
  101. DocLink: fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
  102. },
  103. {
  104. ID: "scrapeInterval",
  105. Query: fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
  106. Label: "Expected Prometheus self-scrape metrics available",
  107. Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
  108. },
  109. {
  110. ID: "cpuThrottling",
  111. Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
  112. / avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
  113. Label: "Kubecost is not CPU throttled",
  114. Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
  115. },
  116. }
  117. for _, pd := range result {
  118. err := pd.executePrometheusDiagnosticQuery(ctx)
  119. if err != nil {
  120. log.Errorf(err.Error())
  121. }
  122. }
  123. return result, nil
  124. }
  125. // PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
  126. type PrometheusDiagnostic struct {
  127. ID string `json:"id"`
  128. Query string `json:"query"`
  129. Label string `json:"label"`
  130. Description string `json:"description"`
  131. DocLink string `json:"docLink"`
  132. Result []*QueryResult `json:"result"`
  133. Passed bool `json:"passed"`
  134. }
  135. // executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
  136. func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
  137. resultCh := ctx.Query(pd.Query)
  138. result, err := resultCh.Await()
  139. if err != nil {
  140. return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
  141. }
  142. if result == nil {
  143. result = []*QueryResult{}
  144. }
  145. pd.Result = result
  146. pd.Passed = len(result) == 0
  147. return nil
  148. }