  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/kubecost/cost-model/pkg/env"
  6. "github.com/kubecost/cost-model/pkg/log"
  7. prometheus "github.com/prometheus/client_golang/api"
  8. )
// QueuedPromRequest is a representation of a request waiting to be sent by the prometheus
// client.
type QueuedPromRequest struct {
	// Context is the name of the query context that issued the request
	// (taken from the queued request's contextName).
	Context string `json:"context"`
	// Query is the PromQL query text awaiting execution.
	Query string `json:"query"`
	// QueueTime is how long the request has been waiting in the queue,
	// in milliseconds (computed via time.Since(start).Milliseconds()).
	QueueTime int64 `json:"queueTime"`
}
// PrometheusQueueState contains diagnostic information concerning the state of the prometheus request
// queue
type PrometheusQueueState struct {
	// QueuedRequests holds a snapshot of every request still waiting in the queue.
	QueuedRequests []*QueuedPromRequest `json:"queuedRequests"`
	// OutboundRequests is the number of requests currently in flight.
	OutboundRequests int `json:"outboundRequests"`
	// TotalRequests is the sum of outbound and queued requests.
	TotalRequests int `json:"totalRequests"`
	// MaxQueryConcurrency is the configured concurrency limit (from env).
	MaxQueryConcurrency int `json:"maxQueryConcurrency"`
}
  24. // GetPrometheusQueueState is a diagnostic function that probes the prometheus request queue and gathers
  25. // query, context, and queue statistics.
  26. func GetPrometheusQueueState(client prometheus.Client) (*PrometheusQueueState, error) {
  27. rlpc, ok := client.(*RateLimitedPrometheusClient)
  28. if !ok {
  29. return nil, fmt.Errorf("Failed to get prometheus queue state for the provided client. Must be of type RateLimitedPrometheusClient.")
  30. }
  31. outbound := rlpc.TotalOutboundRequests()
  32. requests := []*QueuedPromRequest{}
  33. rlpc.queue.Each(func(_ int, entry interface{}) {
  34. if req, ok := entry.(*workRequest); ok {
  35. requests = append(requests, &QueuedPromRequest{
  36. Context: req.contextName,
  37. Query: req.query,
  38. QueueTime: time.Since(req.start).Milliseconds(),
  39. })
  40. }
  41. })
  42. return &PrometheusQueueState{
  43. QueuedRequests: requests,
  44. OutboundRequests: outbound,
  45. TotalRequests: outbound + len(requests),
  46. MaxQueryConcurrency: env.GetMaxQueryConcurrency(),
  47. }, nil
  48. }
  49. // LogPrometheusClientState logs the current state, with respect to outbound requests, if that
  50. // information is available.
  51. func LogPrometheusClientState(client prometheus.Client) {
  52. if rc, ok := client.(requestCounter); ok {
  53. queued := rc.TotalQueuedRequests()
  54. outbound := rc.TotalOutboundRequests()
  55. total := queued + outbound
  56. log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
  57. }
  58. }
  59. // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
  60. func GetPrometheusMetrics(client prometheus.Client, offset string) ([]*PrometheusDiagnostic, error) {
  61. docs := "https://github.com/kubecost/docs/blob/master/diagnostics.md"
  62. ctx := NewNamedContext(client, DiagnosticContextName)
  63. result := []*PrometheusDiagnostic{
  64. {
  65. ID: "cadvisorMetric",
  66. Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total[5m] %s)`, offset),
  67. Label: "cAdvsior metrics available",
  68. Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
  69. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
  70. },
  71. {
  72. ID: "ksmMetric",
  73. Query: fmt.Sprintf(`absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`, offset),
  74. Label: "Kube-state-metrics available",
  75. Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
  76. DocLink: fmt.Sprintf("%s#kube-state-metrics-metrics-available", docs),
  77. },
  78. {
  79. ID: "kubecostMetric",
  80. Query: fmt.Sprintf(`absent_over_time(node_cpu_hourly_cost[5m] %s)`, offset),
  81. Label: "Kubecost metrics available",
  82. Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
  83. },
  84. {
  85. ID: "neMetric",
  86. Query: fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
  87. Label: "Node-exporter metrics available",
  88. Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
  89. DocLink: fmt.Sprintf("%s#node-exporter-metrics-available", docs),
  90. },
  91. {
  92. ID: "recordingMetric",
  93. Query: fmt.Sprintf(`absent_over_time(node_cpu_seconds_total[5m] %s)`, offset),
  94. Label: "Recording rules are available",
  95. Description: "Determine if metrics defined by kubecost recording rules are available during last 5 minutes.",
  96. DocLink: fmt.Sprintf("%s#recording-rules-are-available", docs),
  97. },
  98. {
  99. ID: "cadvisorLabel",
  100. Query: fmt.Sprintf(`absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`, offset),
  101. Label: "Expected cAdvsior labels available",
  102. Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
  103. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", docs),
  104. },
  105. {
  106. ID: "ksmVersion",
  107. Query: fmt.Sprintf(`absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`, offset),
  108. Label: "Expected kube-state-metrics version found",
  109. Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
  110. DocLink: fmt.Sprintf("%s#expected-kube-state-metrics-version-found", docs),
  111. },
  112. {
  113. ID: "scrapeInterval",
  114. Query: fmt.Sprintf(`absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`, offset),
  115. Label: "Expected Prometheus self-scrape metrics available",
  116. Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
  117. },
  118. {
  119. ID: "cpuThrottling",
  120. Query: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace)
  121. / avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m])) by (container_name, pod_name, namespace) > 0.2`,
  122. Label: "Kubecost is not CPU throttled",
  123. Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
  124. },
  125. }
  126. for _, pd := range result {
  127. err := pd.executePrometheusDiagnosticQuery(ctx)
  128. if err != nil {
  129. log.Errorf(err.Error())
  130. }
  131. }
  132. return result, nil
  133. }
// PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
type PrometheusDiagnostic struct {
	// ID is a short unique identifier for the diagnostic (e.g. "cadvisorMetric").
	ID string `json:"id"`
	// Query is the PromQL query executed to evaluate the diagnostic.
	Query string `json:"query"`
	// Label is a short human-readable name for the check.
	Label string `json:"label"`
	// Description explains what the check determines.
	Description string `json:"description"`
	// DocLink optionally points to documentation for remediating a failure.
	DocLink string `json:"docLink"`
	// Result holds the raw query results (never nil after execution).
	Result []*QueryResult `json:"result"`
	// Passed is true when the query returned no results; the diagnostic
	// queries are written (absent_over_time / threshold) so that an empty
	// result set indicates a healthy state.
	Passed bool `json:"passed"`
}
  144. // executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
  145. func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
  146. resultCh := ctx.Query(pd.Query)
  147. result, err := resultCh.Await()
  148. if err != nil {
  149. return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
  150. }
  151. if result == nil {
  152. result = []*QueryResult{}
  153. }
  154. pd.Result = result
  155. pd.Passed = len(result) == 0
  156. return nil
  157. }