jobmetrics.go 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. package metrics
  2. import (
  3. "github.com/opencost/opencost/pkg/clustercache"
  4. "github.com/prometheus/client_golang/prometheus"
  5. dto "github.com/prometheus/client_model/go"
  6. batchv1 "k8s.io/api/batch/v1"
  7. )
  8. var (
  9. jobFailureReasons = []string{"BackoffLimitExceeded", "DeadLineExceeded", "Evicted"}
  10. )
  11. //--------------------------------------------------------------------------
  12. // KubeJobCollector
  13. //--------------------------------------------------------------------------
  14. // KubeJobCollector is a prometheus collector that generates job sourced metrics.
  15. type KubeJobCollector struct {
  16. KubeClusterCache clustercache.ClusterCache
  17. metricsConfig MetricsConfig
  18. }
  19. // Describe sends the super-set of all possible descriptors of metrics
  20. // collected by this Collector.
  21. func (kjc KubeJobCollector) Describe(ch chan<- *prometheus.Desc) {
  22. disabledMetrics := kjc.metricsConfig.GetDisabledMetricsMap()
  23. if _, disabled := disabledMetrics["kube_pod_annotations"]; disabled {
  24. return
  25. }
  26. ch <- prometheus.NewDesc("kube_job_status_failed", "The number of pods which reached Phase Failed and the reason for failure.", []string{}, nil)
  27. }
  28. // Collect is called by the Prometheus registry when collecting metrics.
  29. func (kjc KubeJobCollector) Collect(ch chan<- prometheus.Metric) {
  30. disabledMetrics := kjc.metricsConfig.GetDisabledMetricsMap()
  31. if _, disabled := disabledMetrics["kube_pod_annotations"]; disabled {
  32. return
  33. }
  34. jobs := kjc.KubeClusterCache.GetAllJobs()
  35. for _, job := range jobs {
  36. jobName := job.GetName()
  37. jobNS := job.GetNamespace()
  38. if job.Status.Failed == 0 {
  39. ch <- newKubeJobStatusFailedMetric(jobName, jobNS, "kube_job_status_failed", "", 0)
  40. } else {
  41. for _, condition := range job.Status.Conditions {
  42. if condition.Type == batchv1.JobFailed {
  43. reasonKnown := false
  44. for _, reason := range jobFailureReasons {
  45. reasonKnown = reasonKnown || failureReason(&condition, reason)
  46. ch <- newKubeJobStatusFailedMetric(jobName, jobNS, "kube_job_status_failed", reason, boolFloat64(failureReason(&condition, reason)))
  47. }
  48. // for unknown reasons
  49. if !reasonKnown {
  50. ch <- newKubeJobStatusFailedMetric(jobName, jobNS, "kube_job_status_failed", "", float64(job.Status.Failed))
  51. }
  52. }
  53. }
  54. }
  55. }
  56. }
  57. //--------------------------------------------------------------------------
  58. // KubeJobStatusFailedMetric
  59. //--------------------------------------------------------------------------
  60. // KubeJobStatusFailedMetric
  61. type KubeJobStatusFailedMetric struct {
  62. fqName string
  63. help string
  64. job string
  65. namespace string
  66. reason string
  67. value float64
  68. }
  69. // Creates a new KubeJobStatusFailedMetric, implementation of prometheus.Metric
  70. func newKubeJobStatusFailedMetric(job, namespace, fqName, reason string, value float64) KubeJobStatusFailedMetric {
  71. return KubeJobStatusFailedMetric{
  72. fqName: fqName,
  73. help: "kube_job_status_failed Failed job",
  74. job: job,
  75. namespace: namespace,
  76. reason: reason,
  77. value: value,
  78. }
  79. }
  80. // Desc returns the descriptor for the Metric. This method idempotently
  81. // returns the same descriptor throughout the lifetime of the Metric.
  82. func (kjsf KubeJobStatusFailedMetric) Desc() *prometheus.Desc {
  83. l := prometheus.Labels{
  84. "job_name": kjsf.job,
  85. "namespace": kjsf.namespace,
  86. "reason": kjsf.reason,
  87. }
  88. return prometheus.NewDesc(kjsf.fqName, kjsf.help, []string{}, l)
  89. }
  90. // Write encodes the Metric into a "Metric" Protocol Buffer data
  91. // transmission object.
  92. func (kjsf KubeJobStatusFailedMetric) Write(m *dto.Metric) error {
  93. m.Gauge = &dto.Gauge{
  94. Value: &kjsf.value,
  95. }
  96. m.Label = []*dto.LabelPair{
  97. {
  98. Name: toStringPtr("job_name"),
  99. Value: &kjsf.job,
  100. },
  101. {
  102. Name: toStringPtr("namespace"),
  103. Value: &kjsf.namespace,
  104. },
  105. {
  106. Name: toStringPtr("reason"),
  107. Value: &kjsf.reason,
  108. },
  109. }
  110. return nil
  111. }