2
0

diagnostics.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/opencost/opencost/pkg/env"
  6. "github.com/opencost/opencost/pkg/log"
  7. prometheus "github.com/prometheus/client_golang/api"
  8. )
  9. // Prometheus Metric Diagnostic IDs
  10. const (
  11. // CAdvisorDiagnosticMetricID is the identifier of the metric used to determine if cAdvisor is being scraped.
  12. CAdvisorDiagnosticMetricID = "cadvisorMetric"
  13. // CAdvisorLabelDiagnosticMetricID is the identifier of the metric used to determine if cAdvisor labels are correct.
  14. CAdvisorLabelDiagnosticMetricID = "cadvisorLabel"
  15. // KSMDiagnosticMetricID is the identifier for the metric used to determine if KSM metrics are being scraped.
  16. KSMDiagnosticMetricID = "ksmMetric"
  17. // KSMVersionDiagnosticMetricID is the identifier for the metric used to determine if KSM version is correct.
  18. KSMVersionDiagnosticMetricID = "ksmVersion"
  19. // KubecostDiagnosticMetricID is the identifier for the metric used to determine if Kubecost metrics are being scraped.
  20. KubecostDiagnosticMetricID = "kubecostMetric"
  21. // NodeExporterDiagnosticMetricID is the identifier for the metric used to determine if NodeExporter metrics are being scraped.
  22. NodeExporterDiagnosticMetricID = "neMetric"
  23. // ScrapeIntervalDiagnosticMetricID is the identifier for the metric used to determine if prometheus has its own self-scraped
  24. // metrics.
  25. ScrapeIntervalDiagnosticMetricID = "scrapeInterval"
  26. // CPUThrottlingDiagnosticMetricID is the identifier for the metric used to determine if CPU throttling is being applied to the
  27. // cost-model container.
  28. CPUThrottlingDiagnosticMetricID = "cpuThrottling"
  29. )
  30. const DocumentationBaseURL = "https://github.com/kubecost/docs/blob/master/diagnostics.md"
  31. // diagnostic definitions mapping holds all of the diagnostic definitions that can be used for prometheus metrics diagnostics
  32. var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnosticDefinition{
  33. CAdvisorDiagnosticMetricID: {
  34. ID: CAdvisorDiagnosticMetricID,
  35. QueryFmt: `absent_over_time(container_cpu_usage_seconds_total[5m] %s)`,
  36. Label: "cAdvsior metrics available",
  37. Description: "Determine if cAdvisor metrics are available during last 5 minutes.",
  38. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
  39. },
  40. KSMDiagnosticMetricID: {
  41. ID: KSMDiagnosticMetricID,
  42. QueryFmt: `absent_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte"}[5m] %s)`,
  43. Label: "Kube-state-metrics available",
  44. Description: "Determine if metrics from kube-state-metrics are available during last 5 minutes.",
  45. DocLink: fmt.Sprintf("%s#kube-state-metrics-metrics-available", DocumentationBaseURL),
  46. },
  47. KubecostDiagnosticMetricID: {
  48. ID: KubecostDiagnosticMetricID,
  49. QueryFmt: `absent_over_time(node_cpu_hourly_cost[5m] %s)`,
  50. Label: "Kubecost metrics available",
  51. Description: "Determine if metrics from Kubecost are available during last 5 minutes.",
  52. },
  53. NodeExporterDiagnosticMetricID: {
  54. ID: NodeExporterDiagnosticMetricID,
  55. QueryFmt: `absent_over_time(node_cpu_seconds_total[5m] %s)`,
  56. Label: "Node-exporter metrics available",
  57. Description: "Determine if metrics from node-exporter are available during last 5 minutes.",
  58. DocLink: fmt.Sprintf("%s#node-exporter-metrics-available", DocumentationBaseURL),
  59. },
  60. CAdvisorLabelDiagnosticMetricID: {
  61. ID: CAdvisorLabelDiagnosticMetricID,
  62. QueryFmt: `absent_over_time(container_cpu_usage_seconds_total{container!="",pod!=""}[5m] %s)`,
  63. Label: "Expected cAdvsior labels available",
  64. Description: "Determine if expected cAdvisor labels are present during last 5 minutes.",
  65. DocLink: fmt.Sprintf("%s#cadvisor-metrics-available", DocumentationBaseURL),
  66. },
  67. KSMVersionDiagnosticMetricID: {
  68. ID: KSMVersionDiagnosticMetricID,
  69. QueryFmt: `absent_over_time(kube_persistentvolume_capacity_bytes[5m] %s)`,
  70. Label: "Expected kube-state-metrics version found",
  71. Description: "Determine if metric in required kube-state-metrics version are present during last 5 minutes.",
  72. DocLink: fmt.Sprintf("%s#expected-kube-state-metrics-version-found", DocumentationBaseURL),
  73. },
  74. ScrapeIntervalDiagnosticMetricID: {
  75. ID: ScrapeIntervalDiagnosticMetricID,
  76. QueryFmt: `absent_over_time(prometheus_target_interval_length_seconds[5m] %s)`,
  77. Label: "Expected Prometheus self-scrape metrics available",
  78. Description: "Determine if prometheus has its own self-scraped metrics during the last 5 minutes.",
  79. },
  80. CPUThrottlingDiagnosticMetricID: {
  81. ID: CPUThrottlingDiagnosticMetricID,
  82. QueryFmt: `avg(increase(container_cpu_cfs_throttled_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace)
  83. / avg(increase(container_cpu_cfs_periods_total{container="cost-model"}[10m] %s)) by (container_name, pod_name, namespace) > 0.2`,
  84. Label: "Kubecost is not CPU throttled",
  85. Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
  86. },
  87. }
  88. // QueuedPromRequest is a representation of a request waiting to be sent by the prometheus
  89. // client.
  90. type QueuedPromRequest struct {
  91. Context string `json:"context"`
  92. Query string `json:"query"`
  93. QueueTime int64 `json:"queueTime"`
  94. }
  95. // PrometheusQueueState contains diagnostic information concerning the state of the prometheus request
  96. // queue
  97. type PrometheusQueueState struct {
  98. QueuedRequests []*QueuedPromRequest `json:"queuedRequests"`
  99. OutboundRequests int `json:"outboundRequests"`
  100. TotalRequests int `json:"totalRequests"`
  101. MaxQueryConcurrency int `json:"maxQueryConcurrency"`
  102. }
  103. // GetPrometheusQueueState is a diagnostic function that probes the prometheus request queue and gathers
  104. // query, context, and queue statistics.
  105. func GetPrometheusQueueState(client prometheus.Client) (*PrometheusQueueState, error) {
  106. rlpc, ok := client.(*RateLimitedPrometheusClient)
  107. if !ok {
  108. return nil, fmt.Errorf("Failed to get prometheus queue state for the provided client. Must be of type RateLimitedPrometheusClient.")
  109. }
  110. outbound := rlpc.TotalOutboundRequests()
  111. requests := []*QueuedPromRequest{}
  112. rlpc.queue.Each(func(_ int, req *workRequest) {
  113. requests = append(requests, &QueuedPromRequest{
  114. Context: req.contextName,
  115. Query: req.query,
  116. QueueTime: time.Since(req.start).Milliseconds(),
  117. })
  118. })
  119. return &PrometheusQueueState{
  120. QueuedRequests: requests,
  121. OutboundRequests: outbound,
  122. TotalRequests: outbound + len(requests),
  123. MaxQueryConcurrency: env.GetMaxQueryConcurrency(),
  124. }, nil
  125. }
  126. // LogPrometheusClientState logs the current state, with respect to outbound requests, if that
  127. // information is available.
  128. func LogPrometheusClientState(client prometheus.Client) {
  129. if rc, ok := client.(requestCounter); ok {
  130. queued := rc.TotalQueuedRequests()
  131. outbound := rc.TotalOutboundRequests()
  132. total := queued + outbound
  133. log.Infof("Outbound Requests: %d, Queued Requests: %d, Total Requests: %d", outbound, queued, total)
  134. }
  135. }
  136. // GetPrometheusMetrics returns a list of the state of Prometheus metric used by kubecost using the provided client
  137. func GetPrometheusMetrics(client prometheus.Client, offset string) PrometheusDiagnostics {
  138. ctx := NewNamedContext(client, DiagnosticContextName)
  139. var result []*PrometheusDiagnostic
  140. for _, definition := range diagnosticDefinitions {
  141. pd := definition.NewDiagnostic(offset)
  142. err := pd.executePrometheusDiagnosticQuery(ctx)
  143. // log the errror, append to results anyways, and continue
  144. if err != nil {
  145. log.Errorf(err.Error())
  146. }
  147. result = append(result, pd)
  148. }
  149. return result
  150. }
  151. // GetPrometheusMetricsByID returns a list of the state of specific Prometheus metrics by identifier.
  152. func GetPrometheusMetricsByID(ids []string, client prometheus.Client, offset string) PrometheusDiagnostics {
  153. ctx := NewNamedContext(client, DiagnosticContextName)
  154. var result []*PrometheusDiagnostic
  155. for _, id := range ids {
  156. if definition, ok := diagnosticDefinitions[id]; ok {
  157. pd := definition.NewDiagnostic(offset)
  158. err := pd.executePrometheusDiagnosticQuery(ctx)
  159. // log the errror, append to results anyways, and continue
  160. if err != nil {
  161. log.Errorf(err.Error())
  162. }
  163. result = append(result, pd)
  164. } else {
  165. log.Warnf("Failed to find diagnostic definition for id: %s", id)
  166. }
  167. }
  168. return result
  169. }
  170. // PrometheusDiagnostics is a PrometheusDiagnostic container with helper methods.
  171. type PrometheusDiagnostics []*PrometheusDiagnostic
  172. // HasFailure returns true if any of the diagnostic tests didn't pass.
  173. func (pd PrometheusDiagnostics) HasFailure() bool {
  174. for _, p := range pd {
  175. if !p.Passed {
  176. return true
  177. }
  178. }
  179. return false
  180. }
  181. // diagnosticDefinition is a definition of a diagnostic that can be used to create new
  182. // PrometheusDiagnostic instances using the definition's fields.
  183. type diagnosticDefinition struct {
  184. ID string
  185. QueryFmt string
  186. Label string
  187. Description string
  188. DocLink string
  189. }
  190. // NewDiagnostic creates a new PrometheusDiagnostic instance using the provided definition data.
  191. func (pdd *diagnosticDefinition) NewDiagnostic(offset string) *PrometheusDiagnostic {
  192. // FIXME: Any reasonable way to get the total number of replacements required in the query?
  193. // FIXME: All of the other queries require a single offset replace, but CPUThrottle requires two.
  194. var query string
  195. if pdd.ID == CPUThrottlingDiagnosticMetricID {
  196. query = fmt.Sprintf(pdd.QueryFmt, offset, offset)
  197. } else {
  198. query = fmt.Sprintf(pdd.QueryFmt, offset)
  199. }
  200. return &PrometheusDiagnostic{
  201. ID: pdd.ID,
  202. Query: query,
  203. Label: pdd.Label,
  204. Description: pdd.Description,
  205. DocLink: pdd.DocLink,
  206. }
  207. }
  208. // PrometheusDiagnostic holds information about a metric and the query to ensure it is functional
  209. type PrometheusDiagnostic struct {
  210. ID string `json:"id"`
  211. Query string `json:"query"`
  212. Label string `json:"label"`
  213. Description string `json:"description"`
  214. DocLink string `json:"docLink"`
  215. Result []*QueryResult `json:"result"`
  216. Passed bool `json:"passed"`
  217. }
  218. // executePrometheusDiagnosticQuery executes a PrometheusDiagnostic query using the given context
  219. func (pd *PrometheusDiagnostic) executePrometheusDiagnosticQuery(ctx *Context) error {
  220. resultCh := ctx.Query(pd.Query)
  221. result, err := resultCh.Await()
  222. if err != nil {
  223. return fmt.Errorf("prometheus diagnostic %s failed with error: %s", pd.ID, err)
  224. }
  225. if result == nil {
  226. result = []*QueryResult{}
  227. }
  228. pd.Result = result
  229. pd.Passed = len(result) == 0
  230. return nil
  231. }