jobmetrics.go 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. package metrics
  2. import (
  3. "github.com/opencost/opencost/core/pkg/clustercache"
  4. "github.com/prometheus/client_golang/prometheus"
  5. dto "github.com/prometheus/client_model/go"
  6. batchv1 "k8s.io/api/batch/v1"
  7. )
  8. var (
  9. jobFailureReasons = []string{"BackoffLimitExceeded", "DeadLineExceeded", "Evicted"}
  10. )
  11. //--------------------------------------------------------------------------
  12. // KubeJobCollector
  13. //--------------------------------------------------------------------------
// KubeJobCollector is a prometheus collector that generates job sourced metrics.
type KubeJobCollector struct {
	// KubeClusterCache is the cluster cache whose GetAllJobs result is
	// iterated on each Collect call.
	KubeClusterCache clustercache.ClusterCache
	// metricsConfig supplies the disabled-metrics map consulted by Describe
	// and Collect.
	metricsConfig MetricsConfig
}
  19. // Describe sends the super-set of all possible descriptors of metrics
  20. // collected by this Collector.
  21. func (kjc KubeJobCollector) Describe(ch chan<- *prometheus.Desc) {
  22. disabledMetrics := kjc.metricsConfig.GetDisabledMetricsMap()
  23. if _, disabled := disabledMetrics["kube_job_status_failed"]; disabled {
  24. return
  25. }
  26. ch <- prometheus.NewDesc("kube_job_status_failed", "The number of pods which reached Phase Failed and the reason for failure.", []string{}, nil)
  27. }
  28. // Collect is called by the Prometheus registry when collecting metrics.
  29. func (kjc KubeJobCollector) Collect(ch chan<- prometheus.Metric) {
  30. disabledMetrics := kjc.metricsConfig.GetDisabledMetricsMap()
  31. if _, disabled := disabledMetrics["kube_job_status_failed"]; disabled {
  32. return
  33. }
  34. jobs := kjc.KubeClusterCache.GetAllJobs()
  35. for _, job := range jobs {
  36. jobName := job.Name
  37. jobNS := job.Namespace
  38. jobUID := string(job.UID)
  39. if job.Status.Failed == 0 {
  40. ch <- newKubeJobStatusFailedMetric(jobName, jobNS, jobUID, "kube_job_status_failed", "", 0)
  41. } else {
  42. for _, condition := range job.Status.Conditions {
  43. if condition.Type == batchv1.JobFailed {
  44. reasonKnown := false
  45. for _, reason := range jobFailureReasons {
  46. reasonKnown = reasonKnown || failureReason(&condition, reason)
  47. ch <- newKubeJobStatusFailedMetric(jobName, jobNS, jobUID, "kube_job_status_failed", reason, boolFloat64(failureReason(&condition, reason)))
  48. }
  49. // for unknown reasons
  50. if !reasonKnown {
  51. ch <- newKubeJobStatusFailedMetric(jobName, jobNS, jobUID, "kube_job_status_failed", "", float64(job.Status.Failed))
  52. }
  53. }
  54. }
  55. }
  56. }
  57. }
  58. //--------------------------------------------------------------------------
  59. // KubeJobStatusFailedMetric
  60. //--------------------------------------------------------------------------
  61. // KubeJobStatusFailedMetric
  62. type KubeJobStatusFailedMetric struct {
  63. fqName string
  64. help string
  65. job string
  66. namespace string
  67. uid string
  68. reason string
  69. value float64
  70. }
  71. // Creates a new KubeJobStatusFailedMetric, implementation of prometheus.Metric
  72. func newKubeJobStatusFailedMetric(job, namespace, uid, fqName, reason string, value float64) KubeJobStatusFailedMetric {
  73. return KubeJobStatusFailedMetric{
  74. fqName: fqName,
  75. help: "kube_job_status_failed Failed job",
  76. job: job,
  77. namespace: namespace,
  78. uid: uid,
  79. reason: reason,
  80. value: value,
  81. }
  82. }
  83. // Desc returns the descriptor for the Metric. This method idempotently
  84. // returns the same descriptor throughout the lifetime of the Metric.
  85. func (kjsf KubeJobStatusFailedMetric) Desc() *prometheus.Desc {
  86. l := prometheus.Labels{
  87. "job_name": kjsf.job,
  88. "namespace": kjsf.namespace,
  89. "uid": kjsf.uid,
  90. "reason": kjsf.reason,
  91. }
  92. return prometheus.NewDesc(kjsf.fqName, kjsf.help, []string{}, l)
  93. }
  94. // Write encodes the Metric into a "Metric" Protocol Buffer data
  95. // transmission object.
  96. func (kjsf KubeJobStatusFailedMetric) Write(m *dto.Metric) error {
  97. m.Gauge = &dto.Gauge{
  98. Value: &kjsf.value,
  99. }
  100. m.Label = []*dto.LabelPair{
  101. {
  102. Name: toStringPtr("job_name"),
  103. Value: &kjsf.job,
  104. },
  105. {
  106. Name: toStringPtr("namespace"),
  107. Value: &kjsf.namespace,
  108. },
  109. {
  110. Name: toStringPtr("uid"),
  111. Value: &kjsf.uid,
  112. },
  113. {
  114. Name: toStringPtr("reason"),
  115. Value: &kjsf.reason,
  116. },
  117. }
  118. return nil
  119. }