pod_labels_test.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. package allocation
  2. // Description
// Check Pod Labels from API match results from Prometheus
  4. import (
  5. "testing"
  6. "time"
  7. "github.com/opencost/opencost-integration-tests/pkg/api"
  8. "github.com/opencost/opencost-integration-tests/pkg/prometheus"
  9. )
  10. func TestPodLabels(t *testing.T) {
  11. apiObj := api.NewAPI()
  12. testCases := []struct {
  13. name string
  14. window string
  15. aggregate string
  16. accumulate string
  17. includeAggregatedMetadata string
  18. }{
  19. {
  20. name: "Today",
  21. window: "24h",
  22. aggregate: "pod",
  23. accumulate: "true",
  24. includeAggregatedMetadata: "true",
  25. },
  26. }
  27. t.Logf("testCases: %v", testCases)
  28. for _, tc := range testCases {
  29. t.Run(tc.name, func(t *testing.T) {
  30. queryEnd := time.Now().UTC().Truncate(time.Hour).Add(time.Hour)
  31. endTime := queryEnd.Unix()
  32. // -------------------------------
  33. // Pod Running Time
  34. // avg(avg_over_time(kube_pod_container_status_running{%s}[%s])) by (pod)
  35. // -------------------------------
  36. client := prometheus.NewClient()
  37. promPodRunningInfoInput := prometheus.PrometheusInput{}
  38. promPodRunningInfoInput.Metric = "kube_pod_container_status_running"
  39. promPodRunningInfoInput.Function = []string{"avg_over_time", "avg"}
  40. promPodRunningInfoInput.QueryWindow = tc.window
  41. promPodRunningInfoInput.AggregateBy = []string{"pod"}
  42. promPodRunningInfoInput.Time = &endTime
  43. promPodRunningInfo, err := client.RunPromQLQuery(promPodRunningInfoInput, t)
  44. if err != nil {
  45. t.Fatalf("Error while calling Prometheus API %v", err)
  46. }
  47. podRunningStatus := make(map[string]int)
  48. for _, promPodRunningInfoItem := range promPodRunningInfo.Data.Result {
  49. pod := promPodRunningInfoItem.Metric.Pod
  50. runningStatus := int(promPodRunningInfoItem.Value.Value)
  51. // kube_pod_labels and kube_nodespace_labels might hold labels for dead pods as well
  52. // filter the ones that are running because allocation filters for that
  53. podRunningStatus[pod] = runningStatus
  54. }
  55. // Pod Info - narrow the "running" set to pods that were actually
  56. // running at the query endTime using a 1m resolution subquery,
  57. // matching the pattern used in pod_annotations_test.go.
  58. // Pods that only briefly existed earlier in the 24h window may
  59. // not appear in /allocation, and comparing their labels yields
  60. // false negatives that have nothing to do with label
  61. // propagation.
  62. promPodInfoInput := prometheus.PrometheusInput{}
  63. promPodInfoInput.Metric = "kube_pod_container_status_running"
  64. promPodInfoInput.MetricNotEqualTo = "0"
  65. promPodInfoInput.AggregateBy = []string{"container", "pod", "namespace", "node"}
  66. promPodInfoInput.Function = []string{"avg"}
  67. promPodInfoInput.AggregateWindow = tc.window
  68. promPodInfoInput.AggregateResolution = podStatusResolution
  69. promPodInfoInput.Time = &endTime
  70. podInfo, err := client.RunPromQLQuery(promPodInfoInput, t)
  71. if err != nil {
  72. t.Fatalf("Error while calling Prometheus API %v", err)
  73. }
  74. alive := make(map[string]bool)
  75. for _, r := range podInfo.Data.Result {
  76. alive[r.Metric.Pod] = true
  77. }
  78. // -------------------------------
  79. // Pod Labels
  80. // avg_over_time(kube_pod_labels{%s}[%s])
  81. // -------------------------------
  82. promLabelInfoInput := prometheus.PrometheusInput{}
  83. promLabelInfoInput.Metric = "kube_pod_labels"
  84. promLabelInfoInput.Function = []string{"avg_over_time"}
  85. promLabelInfoInput.QueryWindow = tc.window
  86. promLabelInfoInput.Time = &endTime
  87. promlabelInfo, err := client.RunPromQLQuery(promLabelInfoInput, t)
  88. if err != nil {
  89. t.Fatalf("Error while calling Prometheus API %v", err)
  90. }
  91. // Store Results in a Pod Map
  92. type PodData struct {
  93. Pod string
  94. Alive bool
  95. InAlloc bool
  96. PromLabels map[string]string
  97. AllocLabels map[string]string
  98. }
  99. podMap := make(map[string]*PodData)
  100. // Store Prometheus Pod Prometheus Results
  101. for _, promlabel := range promlabelInfo.Data.Result {
  102. pod := promlabel.Metric.Pod
  103. labels := promlabel.Metric.Labels
  104. // Skip Dead Pods
  105. if podRunningStatus[pod] == 0 {
  106. continue
  107. }
  108. podMap[pod] = &PodData{
  109. Pod: pod,
  110. Alive: alive[pod],
  111. PromLabels: labels,
  112. }
  113. }
  114. // API Response
  115. apiResponse, err := apiObj.GetAllocation(api.AllocationRequest{
  116. Window: tc.window,
  117. Aggregate: tc.aggregate,
  118. Accumulate: tc.accumulate,
  119. IncludeAggregatedMetadata: tc.includeAggregatedMetadata,
  120. })
  121. if err != nil {
  122. t.Fatalf("Error while calling Allocation API %v", err)
  123. }
  124. if apiResponse.Code != 200 {
  125. t.Errorf("API returned non-200 code")
  126. }
  127. // Store Allocation Pod Label Results
  128. for pod, allocationResponseItem := range apiResponse.Data[0] {
  129. podLabels, ok := podMap[pod]
  130. if !ok {
  131. t.Logf("Pod Information Missing from Prometheus %s", pod)
  132. continue
  133. }
  134. podLabels.InAlloc = true
  135. podLabels.AllocLabels = allocationResponseItem.Properties.Labels
  136. }
  137. // Compare Results
  138. for pod, podLabels := range podMap {
  139. t.Logf("Pod: %s", pod)
  140. // Skip pods that were not alive at the query end. They
  141. // may have been running earlier in the window but
  142. // /allocation only reports pods with coincident usage
  143. // metrics, so label comparisons would be noisy.
  144. if !podLabels.Alive {
  145. t.Logf("Skipping %s. Pod Dead at query end.", pod)
  146. continue
  147. }
  148. // Skip pods that were not returned by /allocation. A pod
  149. // can show up in kube_pod_labels but not in /allocation
  150. // when it was very short lived or lacked CPU/memory
  151. // usage samples, which is a window-boundary race rather
  152. // than a label-propagation bug.
  153. if !podLabels.InAlloc {
  154. t.Logf("Skipping %s. Pod not present in /allocation response.", pod)
  155. continue
  156. }
  157. // Prometheus Result will have fewer labels.
  158. // Allocation has oracle and feature related labels
  159. for promLabel, promLabelValue := range podLabels.PromLabels {
  160. allocLabelValue, ok := podLabels.AllocLabels[promLabel]
  161. if !ok {
  162. t.Errorf(" - [Fail]: Prometheus Label %s not found in Allocation", promLabel)
  163. continue
  164. }
  165. if allocLabelValue != promLabelValue {
  166. t.Errorf(" - [Fail]: Alloc %s != Prom %s", allocLabelValue, promLabelValue)
  167. } else {
  168. t.Logf(" - [Pass]: Label: %s", promLabel)
  169. }
  170. }
  171. }
  172. })
  173. }
  174. }