diagnostics.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/kubecost/cost-model/pkg/env"
  6. "github.com/kubecost/cost-model/pkg/log"
  7. prometheus "github.com/prometheus/client_golang/api"
  8. )
// QueuedPromRequest is a representation of a request waiting to be sent by the prometheus
// client.
type QueuedPromRequest struct {
	// Context is the name of the query context that issued the request.
	Context string `json:"context"`
	// Query is the PromQL query string awaiting execution.
	Query string `json:"query"`
	// QueueTime is how long the request has been waiting in the queue,
	// in milliseconds.
	QueueTime int64 `json:"queueTime"`
}
// PrometheusQueueState contains diagnostic information concerning the state of the prometheus request
// queue
type PrometheusQueueState struct {
	// QueuedRequests lists the requests currently waiting in the queue.
	QueuedRequests []*QueuedPromRequest `json:"queuedRequests"`
	// OutboundRequests is the number of requests currently in flight.
	OutboundRequests int `json:"outboundRequests"`
	// TotalRequests is the sum of queued and outbound requests.
	TotalRequests int `json:"totalRequests"`
	// MaxQueryConcurrency is the configured concurrency limit (from env).
	MaxQueryConcurrency int `json:"maxQueryConcurrency"`
}
  24. // GetPrometheusQueueState is a diagnostic function that probes the prometheus request queue and gathers
  25. // query, context, and queue statistics.
  26. func GetPrometheusQueueState(client prometheus.Client) (*PrometheusQueueState, error) {
  27. rlpc, ok := client.(*RateLimitedPrometheusClient)
  28. if !ok {
  29. return nil, fmt.Errorf("Failed to get prometheus queue state for the provided client. Must be of type RateLimitedPrometheusClient.")
  30. }
  31. outbound := rlpc.TotalOutboundRequests()
  32. requests := []*QueuedPromRequest{}
  33. rlpc.queue.Each(func(_ int, entry interface{}) {
  34. if req, ok := entry.(*workRequest); ok {
  35. requests = append(requests, &QueuedPromRequest{
  36. Context: req.contextName,
  37. Query: req.query,
  38. QueueTime: time.Since(req.start).Milliseconds(),
  39. })
  40. }
  41. })
  42. return &PrometheusQueueState{
  43. QueuedRequests: requests,
  44. OutboundRequests: outbound,
  45. TotalRequests: outbound + len(requests),
  46. MaxQueryConcurrency: env.GetMaxQueryConcurrency(),
  47. }, nil
  48. }
  49. // LogPrometheusClientState logs the current state, with respect to outbound requests, if that
  50. // information is available.
  51. func LogPrometheusClientState(client prometheus.Client) {
  52. if rc, ok := client.(requestCounter); ok {
  53. queued := rc.TotalQueuedRequests()
  54. outbound := rc.TotalOutboundRequests()
  55. total := queued + outbound
  56. log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
  57. }
  58. }
  59. // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
  60. func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
  61. docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
  62. ctx := NewNamedContext(client, DiagnosticContextName)
  63. result := []*PrometheusDiagnostic{
  64. {
  65. ID: "cadvisorMetric",
  66. Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
  67. Label: "cAdvsior metrics available",
  68. Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
  69. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
  70. },
  71. {
  72. ID: "ksmMetric",
  73. Query: fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
  74. Label: "Kube-state-metrics available",
  75. Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
  76. DocLink: fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
  77. },
  78. {
  79. ID: "kubecostMetric",
  80. Query: fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
  81. Label: "Kubecost metrics available",
  82. Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
  83. },
  84. {
  85. ID: "neMetric",
  86. Query: fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
  87. Label: "Node-exporter metrics available",
  88. Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
  89. DocLink: fmt.Sprintf("%s#node-exporter-metrics-available", docs),
  90. },
  91. {
  92. ID: "cadvisorLabel",
  93. Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`, offset),
  94. Label: "Expected cAdvsior labels available",
  95. Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
  96. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
  97. },
  98. {
  99. ID: "ksmVersion",
  100. Query: fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
  101. Label: "Expected kube-state-metrics version found",
  102. Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
  103. DocLink: fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
  104. },
  105. {
  106. ID: "scrapeInterval",
  107. Query: fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
  108. Label: "Expected Prometheus self-scrape metrics available",
  109. Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
  110. },
  111. {
  112. ID: "cpuThrottling",
  113. Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
  114. / avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
  115. Label: "Kubecost is not CPU throttled",
  116. Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
  117. },
  118. }
  119. for _, pd := range result {
  120. err := pd.executePrometheusDiagnosticQuery(ctx)
  121. if err != nil {
  122. log.Errorf(err.Error())
  123. }
  124. }
  125. return result, nil
  126. }
// PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
type PrometheusDiagnostic struct {
	// ID uniquely identifies the diagnostic (e.g. "cadvisorMetric").
	ID string `json:"id"`
	// Query is the PromQL query executed for this diagnostic.
	Query string `json:"query"`
	// Label is a short human-readable name for the check.
	Label string `json:"label"`
	// Description explains what the diagnostic determines.
	Description string `json:"description"`
	// DocLink is an optional link to troubleshooting documentation.
	DocLink string `json:"docLink"`
	// Result holds the raw query results; populated after execution.
	Result []*QueryResult `json:"result"`
	// Passed is true when the query returned no results (the queries are
	// written so that an empty result set means the check succeeded).
	Passed bool `json:"passed"`
}
  137. // executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
  138. func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
  139. resultCh := ctx.Query(pd.Query)
  140. result, err := resultCh.Await()
  141. if err != nil {
  142. return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
  143. }
  144. if result == nil {
  145. result = []*QueryResult{}
  146. }
  147. pd.Result = result
  148. pd.Passed = len(result) == 0
  149. return nil
  150. }