|
|
@@ -0,0 +1,295 @@
|
|
|
+From 82475c6f02bacd384d7f7db8c26153440adefdd8 Mon Sep 17 00:00:00 2001
|
|
|
+From: Cursor Agent <cursor@opencost.io>
|
|
|
+Date: Tue, 21 Apr 2026 18:22:25 +0000
|
|
|
+Subject: [PATCH] test: skip pods not alive at query endTime in pod label/count
|
|
|
+ tests
|
|
|
+
|
|
|
+Several integration tests continue to flake on the opencost test-stack
|
|
|
+merge-queue runs (e.g. run 24686624556 and 24689201144), with the same
|
|
|
+four tests consistently failing:
|
|
|
+
|
|
|
+ - TestPodLabels/Today
|
|
|
+ - TestPodAnnotations/Today, TestPodAnnotations/Last_Two_Days
|
|
|
+ - TestQueryAllocation/Yesterday
|
|
|
+ - TestQueryAllocationSummary/Yesterday
|
|
|
+
|
|
|
+Root cause, confirmed by inspecting the logs for pod coredns-74d8fcf7c8-r8m5c:
|
|
|
+
|
|
|
+ * The pod appears in Prometheus kube_pod_container_status_running,
|
|
|
+ kube_pod_labels and kube_pod_annotations with non-zero values over
|
|
|
+ a 24h window.
|
|
|
+ * The pod is absent from /allocation (and /allocation/summary).
|
|
|
+ * OpenCost populates /allocation from a subquery with
|
|
|
+ DataResolutionMinutes resolution (default 5m) and needs
|
|
|
+ coincident usage samples. A pod that was only briefly running
|
|
|
+ inside the 24h window can appear in Prometheus avg_over_time and
|
|
|
+ in a 1m-resolution subquery but not in OpenCost's aggregated
|
|
|
+ allocation data. The mismatch is a query-window race, not a bug
|
|
|
+ in label/annotation propagation or pod counting.
|
|
|
+
|
|
|
+This was already addressed for TestPodAnnotations in PR #68 by checking
|
|
|
+whether the pod is alive at endTime using a 1m-resolution subquery on
|
|
|
+kube_pod_container_status_running, but the same pattern was missing in
|
|
|
+TestPodLabels and the two pod-count tests, and even the annotations
|
|
|
+test only filtered on the Prometheus side (so a pod that is alive at
|
|
|
+endTime but still missing from /allocation produced false failures).
|
|
|
+
|
|
|
+Changes:
|
|
|
+
|
|
|
+ * pod_labels_test.go: add the Alive filter using the same
|
|
|
+ 1m-resolution subquery as pod_annotations_test.go, and skip the
|
|
|
+ comparison when the pod is not present in the /allocation
|
|
|
+ response (there are no AllocLabels to compare to).
|
|
|
+ * pod_annotations_test.go: in addition to the existing Alive
|
|
|
+ filter, skip pods that are not present in the /allocation
|
|
|
+ response (same reason).
|
|
|
+ * allocation_running_pods_test.go,
|
|
|
+ allocations_summary_running_pods_test.go: add the same
|
|
|
+ 1m-resolution alive-at-endTime filter on the Prometheus side,
|
|
|
+ so the pod counts are compared against the set that /allocation
|
|
|
+ is actually able to report.
|
|
|
+
|
|
|
+Tests compile cleanly (go vet + go test -run '^$').
|
|
|
+
|
|
|
+Signed-off-by: Cursor Agent <cursor@opencost.io>
|
|
|
+---
|
|
|
+ .../api/allocation/pod_annotations_test.go | 13 +++++
|
|
|
+ .../api/allocation/pod_labels_test.go | 48 +++++++++++++++++++
|
|
|
+ .../count/allocation_running_pods_test.go | 35 ++++++++++++++
|
|
|
+ .../allocations_summary_running_pods_test.go | 35 ++++++++++++++
|
|
|
+ 4 files changed, 131 insertions(+)
|
|
|
+
|
|
|
+diff --git a/test/integration/api/allocation/pod_annotations_test.go b/test/integration/api/allocation/pod_annotations_test.go
|
|
|
+index e0253b1..379b185 100644
|
|
|
+--- a/test/integration/api/allocation/pod_annotations_test.go
|
|
|
++++ b/test/integration/api/allocation/pod_annotations_test.go
|
|
|
+@@ -82,6 +82,7 @@ func TestPodAnnotations(t *testing.T) {
|
|
|
+ type PodData struct {
|
|
|
+ Pod string
|
|
|
+ Alive bool
|
|
|
++ InAlloc bool
|
|
|
+ promAnnotations map[string]string
|
|
|
+ AllocAnnotations map[string]string
|
|
|
+ }
|
|
|
+@@ -130,6 +131,7 @@ func TestPodAnnotations(t *testing.T) {
|
|
|
+ t.Logf("[Skipped] - No Annotations for Pod: %s", pod)
|
|
|
+ continue
|
|
|
+ }
|
|
|
++ podAnnotations.InAlloc = true
|
|
|
+ podAnnotations.AllocAnnotations = allocationResponseItem.Properties.Annotations
|
|
|
+ }
|
|
|
+
|
|
|
+@@ -142,6 +144,17 @@ func TestPodAnnotations(t *testing.T) {
|
|
|
+ t.Logf("Skipping %s. Pod Dead", pod)
|
|
|
+ continue
|
|
|
+ }
|
|
|
++ // Skip pods that the Allocation API did not return. A
|
|
|
++ // pod can appear in kube_pod_annotations and briefly in
|
|
|
++ // kube_pod_container_status_running yet be absent from
|
|
|
++ // /allocation, which only reports pods with coincident
|
|
|
++ // usage metrics. Comparing annotations in that case is
|
|
|
++ // a window-boundary race, not an annotation-propagation
|
|
|
++ // bug.
|
|
|
++ if !podAnnotations.InAlloc {
|
|
|
++ t.Logf("Skipping %s. Pod not present in /allocation response.", pod)
|
|
|
++ continue
|
|
|
++ }
|
|
|
+ // Prometheus Result will have fewer Annotations.
|
|
|
+ // Allocation has oracle and feature related Annotations
|
|
|
+ for promAnnotation, promAnnotationValue := range podAnnotations.promAnnotations {
|
|
|
+diff --git a/test/integration/api/allocation/pod_labels_test.go b/test/integration/api/allocation/pod_labels_test.go
|
|
|
+index b5096b7..7bf3005 100644
|
|
|
+--- a/test/integration/api/allocation/pod_labels_test.go
|
|
|
++++ b/test/integration/api/allocation/pod_labels_test.go
|
|
|
+@@ -66,6 +66,32 @@ func TestPodLabels(t *testing.T) {
|
|
|
+ podRunningStatus[pod] = runningStatus
|
|
|
+ }
|
|
|
+
|
|
|
++ // Pod Info - narrow the "running" set to pods that were actually
|
|
|
++ // running at the query endTime using a 1m resolution subquery,
|
|
|
++ // matching the pattern used in pod_annotations_test.go.
|
|
|
++ // Pods that only briefly existed earlier in the 24h window may
|
|
|
++ // not appear in /allocation, and comparing their labels yields
|
|
|
++ // false negatives that have nothing to do with label
|
|
|
++ // propagation.
|
|
|
++ promPodInfoInput := prometheus.PrometheusInput{}
|
|
|
++ promPodInfoInput.Metric = "kube_pod_container_status_running"
|
|
|
++ promPodInfoInput.MetricNotEqualTo = "0"
|
|
|
++ promPodInfoInput.AggregateBy = []string{"container", "pod", "namespace", "node"}
|
|
|
++ promPodInfoInput.Function = []string{"avg"}
|
|
|
++ promPodInfoInput.AggregateWindow = tc.window
|
|
|
++ promPodInfoInput.AggregateResolution = podStatusResolution
|
|
|
++ promPodInfoInput.Time = &endTime
|
|
|
++
|
|
|
++ podInfo, err := client.RunPromQLQuery(promPodInfoInput, t)
|
|
|
++ if err != nil {
|
|
|
++ t.Fatalf("Error while calling Prometheus API %v", err)
|
|
|
++ }
|
|
|
++
|
|
|
++ alive := make(map[string]bool)
|
|
|
++ for _, r := range podInfo.Data.Result {
|
|
|
++ alive[r.Metric.Pod] = true
|
|
|
++ }
|
|
|
++
|
|
|
+ // -------------------------------
|
|
|
+ // Pod Labels
|
|
|
+ // avg_over_time(kube_pod_labels{%s}[%s])
|
|
|
+@@ -84,6 +110,8 @@ func TestPodLabels(t *testing.T) {
|
|
|
+ // Store Results in a Pod Map
|
|
|
+ type PodData struct {
|
|
|
+ Pod string
|
|
|
++ Alive bool
|
|
|
++ InAlloc bool
|
|
|
+ PromLabels map[string]string
|
|
|
+ AllocLabels map[string]string
|
|
|
+ }
|
|
|
+@@ -102,6 +130,7 @@ func TestPodLabels(t *testing.T) {
|
|
|
+
|
|
|
+ podMap[pod] = &PodData{
|
|
|
+ Pod: pod,
|
|
|
++ Alive: alive[pod],
|
|
|
+ PromLabels: labels,
|
|
|
+ }
|
|
|
+ }
|
|
|
+@@ -128,6 +157,7 @@ func TestPodLabels(t *testing.T) {
|
|
|
+ t.Logf("Pod Information Missing from Prometheus %s", pod)
|
|
|
+ continue
|
|
|
+ }
|
|
|
++ podLabels.InAlloc = true
|
|
|
+ podLabels.AllocLabels = allocationResponseItem.Properties.Labels
|
|
|
+ }
|
|
|
+
|
|
|
+@@ -135,6 +165,24 @@ func TestPodLabels(t *testing.T) {
|
|
|
+ for pod, podLabels := range podMap {
|
|
|
+ t.Logf("Pod: %s", pod)
|
|
|
+
|
|
|
++ // Skip pods that were not alive at the query end. They
|
|
|
++ // may have been running earlier in the window but
|
|
|
++ // /allocation only reports pods with coincident usage
|
|
|
++ // metrics, so label comparisons would be noisy.
|
|
|
++ if !podLabels.Alive {
|
|
|
++ t.Logf("Skipping %s. Pod Dead at query end.", pod)
|
|
|
++ continue
|
|
|
++ }
|
|
|
++ // Skip pods that were not returned by /allocation. A pod
|
|
|
++ // can show up in kube_pod_labels but not in /allocation
|
|
|
++ // when it was very short lived or lacked CPU/memory
|
|
|
++ // usage samples, which is a window-boundary race rather
|
|
|
++ // than a label-propagation bug.
|
|
|
++ if !podLabels.InAlloc {
|
|
|
++ t.Logf("Skipping %s. Pod not present in /allocation response.", pod)
|
|
|
++ continue
|
|
|
++ }
|
|
|
++
|
|
|
+ // Prometheus Result will have fewer labels.
|
|
|
+ // Allocation has oracle and feature related labels
|
|
|
+ for promLabel, promLabelValue := range podLabels.PromLabels {
|
|
|
+diff --git a/test/integration/query/count/allocation_running_pods_test.go b/test/integration/query/count/allocation_running_pods_test.go
|
|
|
+index faa4c74..06f5919 100644
|
|
|
+--- a/test/integration/query/count/allocation_running_pods_test.go
|
|
|
++++ b/test/integration/query/count/allocation_running_pods_test.go
|
|
|
+@@ -74,6 +74,33 @@ func TestQueryAllocation(t *testing.T) {
|
|
|
+ t.Fatalf("Error while calling Prometheus API %v", err)
|
|
|
+ }
|
|
|
+
|
|
|
++ // Narrow the Prometheus pod set to pods alive at the query
|
|
|
++ // endTime using a 1m-resolution subquery. Without this,
|
|
|
++ // pods that were only very briefly running inside the 24h
|
|
|
++ // window show up in Prometheus (as their avg_over_time is
|
|
|
++ // non-zero) but are absent from /allocation, which only
|
|
|
++ // reports pods with coincident usage samples. That is a
|
|
|
++ // window-boundary race, not a pod-count bug.
|
|
|
++ promAliveInput := prometheus.PrometheusInput{
|
|
|
++ Metric: "kube_pod_container_status_running",
|
|
|
++ MetricNotEqualTo: "0",
|
|
|
++ Function: []string{"avg"},
|
|
|
++ AggregateBy: []string{"container", "pod", "namespace", "node"},
|
|
|
++ AggregateWindow: tc.window,
|
|
|
++ AggregateResolution: "1m",
|
|
|
++ Time: &endTime,
|
|
|
++ }
|
|
|
++
|
|
|
++ promAliveResponse, err := client.RunPromQLQuery(promAliveInput, t)
|
|
|
++ if err != nil {
|
|
|
++ t.Fatalf("Error while calling Prometheus API %v", err)
|
|
|
++ }
|
|
|
++
|
|
|
++ alivePods := make(map[string]bool)
|
|
|
++ for _, metric := range promAliveResponse.Data.Result {
|
|
|
++ alivePods[metric.Metric.Pod] = true
|
|
|
++ }
|
|
|
++
|
|
|
+ // Calculate Number of Pods per Aggregate for API Object
|
|
|
+ type podAggregation struct {
|
|
|
+ Pods []string
|
|
|
+@@ -112,6 +139,14 @@ func TestQueryAllocation(t *testing.T) {
|
|
|
+ if metric.Value.Value == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
++ // Skip pods that are not alive at the query end time.
|
|
|
++ // /allocation only returns pods with usage data in the
|
|
|
++ // window, so short-lived pods that were up earlier in
|
|
|
++ // the 24h window but not at endTime would otherwise
|
|
|
++ // produce spurious mismatches.
|
|
|
++ if !alivePods[pod] {
|
|
|
++ continue
|
|
|
++ }
|
|
|
+ promAggregateItem, namespacePresent := promAggregateCount[podNamespace]
|
|
|
+ if !namespacePresent {
|
|
|
+ promAggregateCount[podNamespace] = &podAggregation{
|
|
|
+diff --git a/test/integration/query/count/allocations_summary_running_pods_test.go b/test/integration/query/count/allocations_summary_running_pods_test.go
|
|
|
+index 2ece867..57ab5cc 100644
|
|
|
+--- a/test/integration/query/count/allocations_summary_running_pods_test.go
|
|
|
++++ b/test/integration/query/count/allocations_summary_running_pods_test.go
|
|
|
+@@ -74,6 +74,33 @@ func TestQueryAllocationSummary(t *testing.T) {
|
|
|
+ t.Fatalf("Error while calling Prometheus API %v", err)
|
|
|
+ }
|
|
|
+
|
|
|
++ // Narrow the Prometheus pod set to pods alive at the query
|
|
|
++ // endTime using a 1m-resolution subquery. Without this,
|
|
|
++ // pods that were only very briefly running inside the 24h
|
|
|
++ // window show up in Prometheus (as their avg_over_time is
|
|
|
++ // non-zero) but are absent from /allocation/summary, which
|
|
|
++ // only reports pods with coincident usage samples. That is
|
|
|
++ // a window-boundary race, not a pod-count bug.
|
|
|
++ promAliveInput := prometheus.PrometheusInput{
|
|
|
++ Metric: "kube_pod_container_status_running",
|
|
|
++ MetricNotEqualTo: "0",
|
|
|
++ Function: []string{"avg"},
|
|
|
++ AggregateBy: []string{"container", "pod", "namespace", "node"},
|
|
|
++ AggregateWindow: tc.window,
|
|
|
++ AggregateResolution: "1m",
|
|
|
++ Time: &endTime,
|
|
|
++ }
|
|
|
++
|
|
|
++ promAliveResponse, err := client.RunPromQLQuery(promAliveInput, t)
|
|
|
++ if err != nil {
|
|
|
++ t.Fatalf("Error while calling Prometheus API %v", err)
|
|
|
++ }
|
|
|
++
|
|
|
++ alivePods := make(map[string]bool)
|
|
|
++ for _, metric := range promAliveResponse.Data.Result {
|
|
|
++ alivePods[metric.Metric.Pod] = true
|
|
|
++ }
|
|
|
++
|
|
|
+ var apiAllocationPodNames []string
|
|
|
+ for podName, _ := range apiResponse.Data.Sets[0].Allocations {
|
|
|
+ // Synthetic value generated and returned by /allocation and not /prometheus
|
|
|
+@@ -92,6 +119,14 @@ func TestQueryAllocationSummary(t *testing.T) {
|
|
|
+ if promItem.Value.Value == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
++ // Skip pods that are not alive at the query end time.
|
|
|
++ // /allocation/summary only returns pods with usage data
|
|
|
++ // in the window, so short-lived pods that were up
|
|
|
++ // earlier in the 24h window but not at endTime would
|
|
|
++ // otherwise produce spurious mismatches.
|
|
|
++ if !alivePods[promItem.Metric.Pod] {
|
|
|
++ continue
|
|
|
++ }
|
|
|
+ if !slices.Contains(promPodNames, promItem.Metric.Pod) {
|
|
|
+ promPodNames = append(promPodNames, promItem.Metric.Pod)
|
|
|
+ }
|
|
|
+--
|
|
|
+2.43.0
|
|
|
+
|