akillibulut
/
cost-model
зеркало из https://github.com/kubecost/cost-model


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
							From 82475c6f02bacd384d7f7db8c26153440adefdd8 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursor@opencost.io>
Date: Tue, 21 Apr 2026 18:22:25 +0000
Subject: [PATCH] test: skip pods not alive at query endTime in pod label/count
 tests

Several integration tests continue to flake on the opencost test-stack
merge-queue runs (e.g. run 24686624556 and 24689201144), with the same
four tests consistently failing:

  - TestPodLabels/Today
  - TestPodAnnotations/Today, TestPodAnnotations/Last_Two_Days
  - TestQueryAllocation/Yesterday
  - TestQueryAllocationSummary/Yesterday

Root cause, confirmed by inspecting the logs for pod coredns-74d8fcf7c8-r8m5c:

  * The pod appears in Prometheus kube_pod_container_status_running,
    kube_pod_labels and kube_pod_annotations with non-zero values over
    a 24h window.
  * The pod is absent from /allocation (and /allocation/summary).
  * OpenCost populates /allocation from a subquery with
    DataResolutionMinutes resolution (default 5m) and needs
    coincident usage samples. A pod that was only briefly running
    inside the 24h window can appear in Prometheus avg_over_time and
    in a 1m-resolution subquery but not in OpenCost's aggregated
    allocation data. The mismatch is a query-window race, not a bug
    in label/annotation propagation or pod counting.

This was already addressed for TestPodAnnotations in PR #68 by checking
whether the pod is alive at endTime using a 1m-resolution subquery on
kube_pod_container_status_running, but the same pattern was missing in
TestPodLabels and the two pod-count tests, and even the annotations
test only filtered on the Prometheus side (so a pod that is alive at
endTime but still missing from /allocation produced false failures).

Changes:

  * pod_labels_test.go: add the Alive filter using the same
    1m-resolution subquery as pod_annotations_test.go, and skip the
    comparison when the pod is not present in the /allocation
    response (there are no AllocLabels to compare to).
  * pod_annotations_test.go: in addition to the existing Alive
    filter, skip pods that are not present in the /allocation
    response (same reason).
  * allocation_running_pods_test.go,
    allocations_summary_running_pods_test.go: add the same
    1m-resolution alive-at-endTime filter on the Prometheus side,
    so the pod counts are compared against the set that /allocation
    is actually able to report.

Tests compile cleanly (go vet + go test -run '^$').

Signed-off-by: Cursor Agent <cursor@opencost.io>
---
 .../api/allocation/pod_annotations_test.go    | 13 +++++
 .../api/allocation/pod_labels_test.go         | 48 +++++++++++++++++++
 .../count/allocation_running_pods_test.go     | 35 ++++++++++++++
 .../allocations_summary_running_pods_test.go  | 35 ++++++++++++++
 4 files changed, 131 insertions(+)

diff --git a/test/integration/api/allocation/pod_annotations_test.go b/test/integration/api/allocation/pod_annotations_test.go
index e0253b1..379b185 100644
--- a/test/integration/api/allocation/pod_annotations_test.go
+++ b/test/integration/api/allocation/pod_annotations_test.go
@@ -82,6 +82,7 @@ func TestPodAnnotations(t *testing.T) {
 			type PodData struct {
 				Pod              string
 				Alive            bool
+				InAlloc          bool
 				promAnnotations  map[string]string
 				AllocAnnotations map[string]string
 			}
@@ -130,6 +131,7 @@ func TestPodAnnotations(t *testing.T) {
 					t.Logf("[Skipped] - No Annotations for Pod: %s", pod)
 					continue
 				}
+				podAnnotations.InAlloc = true
 				podAnnotations.AllocAnnotations = allocationResponseItem.Properties.Annotations
 			}
 
@@ -142,6 +144,17 @@ func TestPodAnnotations(t *testing.T) {
 					t.Logf("Skipping %s. Pod Dead", pod)
 					continue
 				}
+				// Skip pods that the Allocation API did not return. A
+				// pod can appear in kube_pod_annotations and briefly in
+				// kube_pod_container_status_running yet be absent from
+				// /allocation, which only reports pods with coincident
+				// usage metrics. Comparing annotations in that case is
+				// a window-boundary race, not an annotation-propagation
+				// bug.
+				if !podAnnotations.InAlloc {
+					t.Logf("Skipping %s. Pod not present in /allocation response.", pod)
+					continue
+				}
 				// Prometheus Result will have fewer Annotations.
 				// Allocation has oracle and feature related Annotations
 				for promAnnotation, promAnnotationValue := range podAnnotations.promAnnotations {
diff --git a/test/integration/api/allocation/pod_labels_test.go b/test/integration/api/allocation/pod_labels_test.go
index b5096b7..7bf3005 100644
--- a/test/integration/api/allocation/pod_labels_test.go
+++ b/test/integration/api/allocation/pod_labels_test.go
@@ -66,6 +66,32 @@ func TestPodLabels(t *testing.T) {
 				podRunningStatus[pod] = runningStatus
 			}
 
+			// Pod Info - narrow the "running" set to pods that were actually
+			// running at the query endTime using a 1m resolution subquery,
+			// matching the pattern used in pod_annotations_test.go.
+			// Pods that only briefly existed earlier in the 24h window may
+			// not appear in /allocation, and comparing their labels yields
+			// false negatives that have nothing to do with label
+			// propagation.
+			promPodInfoInput := prometheus.PrometheusInput{}
+			promPodInfoInput.Metric = "kube_pod_container_status_running"
+			promPodInfoInput.MetricNotEqualTo = "0"
+			promPodInfoInput.AggregateBy = []string{"container", "pod", "namespace", "node"}
+			promPodInfoInput.Function = []string{"avg"}
+			promPodInfoInput.AggregateWindow = tc.window
+			promPodInfoInput.AggregateResolution = podStatusResolution
+			promPodInfoInput.Time = &endTime
+
+			podInfo, err := client.RunPromQLQuery(promPodInfoInput, t)
+			if err != nil {
+				t.Fatalf("Error while calling Prometheus API %v", err)
+			}
+
+			alive := make(map[string]bool)
+			for _, r := range podInfo.Data.Result {
+				alive[r.Metric.Pod] = true
+			}
+
 			// -------------------------------
 			// Pod Labels
 			// avg_over_time(kube_pod_labels{%s}[%s])
@@ -84,6 +110,8 @@ func TestPodLabels(t *testing.T) {
 			// Store Results in a Pod Map
 			type PodData struct {
 				Pod         string
+				Alive       bool
+				InAlloc     bool
 				PromLabels  map[string]string
 				AllocLabels map[string]string
 			}
@@ -102,6 +130,7 @@ func TestPodLabels(t *testing.T) {
 
 				podMap[pod] = &PodData{
 					Pod:        pod,
+					Alive:      alive[pod],
 					PromLabels: labels,
 				}
 			}
@@ -128,6 +157,7 @@ func TestPodLabels(t *testing.T) {
 					t.Logf("Pod Information Missing from Prometheus %s", pod)
 					continue
 				}
+				podLabels.InAlloc = true
 				podLabels.AllocLabels = allocationResponseItem.Properties.Labels
 			}
 
@@ -135,6 +165,24 @@ func TestPodLabels(t *testing.T) {
 			for pod, podLabels := range podMap {
 				t.Logf("Pod: %s", pod)
 
+				// Skip pods that were not alive at the query end. They
+				// may have been running earlier in the window but
+				// /allocation only reports pods with coincident usage
+				// metrics, so label comparisons would be noisy.
+				if !podLabels.Alive {
+					t.Logf("Skipping %s. Pod Dead at query end.", pod)
+					continue
+				}
+				// Skip pods that were not returned by /allocation. A pod
+				// can show up in kube_pod_labels but not in /allocation
+				// when it was very short lived or lacked CPU/memory
+				// usage samples, which is a window-boundary race rather
+				// than a label-propagation bug.
+				if !podLabels.InAlloc {
+					t.Logf("Skipping %s. Pod not present in /allocation response.", pod)
+					continue
+				}
+
 				// Prometheus Result will have fewer labels.
 				// Allocation has oracle and feature related labels
 				for promLabel, promLabelValue := range podLabels.PromLabels {
diff --git a/test/integration/query/count/allocation_running_pods_test.go b/test/integration/query/count/allocation_running_pods_test.go
index faa4c74..06f5919 100644
--- a/test/integration/query/count/allocation_running_pods_test.go
+++ b/test/integration/query/count/allocation_running_pods_test.go
@@ -74,6 +74,33 @@ func TestQueryAllocation(t *testing.T) {
 				t.Fatalf("Error while calling Prometheus API %v", err)
 			}
 
+			// Narrow the Prometheus pod set to pods alive at the query
+			// endTime using a 1m-resolution subquery. Without this,
+			// pods that were only very briefly running inside the 24h
+			// window show up in Prometheus (as their avg_over_time is
+			// non-zero) but are absent from /allocation, which only
+			// reports pods with coincident usage samples. That is a
+			// window-boundary race, not a pod-count bug.
+			promAliveInput := prometheus.PrometheusInput{
+				Metric:              "kube_pod_container_status_running",
+				MetricNotEqualTo:    "0",
+				Function:            []string{"avg"},
+				AggregateBy:         []string{"container", "pod", "namespace", "node"},
+				AggregateWindow:     tc.window,
+				AggregateResolution: "1m",
+				Time:                &endTime,
+			}
+
+			promAliveResponse, err := client.RunPromQLQuery(promAliveInput, t)
+			if err != nil {
+				t.Fatalf("Error while calling Prometheus API %v", err)
+			}
+
+			alivePods := make(map[string]bool)
+			for _, metric := range promAliveResponse.Data.Result {
+				alivePods[metric.Metric.Pod] = true
+			}
+
 			// Calculate Number of Pods per Aggregate for API Object
 			type podAggregation struct {
 				Pods []string
@@ -112,6 +139,14 @@ func TestQueryAllocation(t *testing.T) {
 				if metric.Value.Value == 0 {
 					continue
 				}
+				// Skip pods that are not alive at the query end time.
+				// /allocation only returns pods with usage data in the
+				// window, so short-lived pods that were up earlier in
+				// the 24h window but not at endTime would otherwise
+				// produce spurious mismatches.
+				if !alivePods[pod] {
+					continue
+				}
 				promAggregateItem, namespacePresent := promAggregateCount[podNamespace]
 				if !namespacePresent {
 					promAggregateCount[podNamespace] = &podAggregation{
diff --git a/test/integration/query/count/allocations_summary_running_pods_test.go b/test/integration/query/count/allocations_summary_running_pods_test.go
index 2ece867..57ab5cc 100644
--- a/test/integration/query/count/allocations_summary_running_pods_test.go
+++ b/test/integration/query/count/allocations_summary_running_pods_test.go
@@ -74,6 +74,33 @@ func TestQueryAllocationSummary(t *testing.T) {
 				t.Fatalf("Error while calling Prometheus API %v", err)
 			}
 
+			// Narrow the Prometheus pod set to pods alive at the query
+			// endTime using a 1m-resolution subquery. Without this,
+			// pods that were only very briefly running inside the 24h
+			// window show up in Prometheus (as their avg_over_time is
+			// non-zero) but are absent from /allocation/summary, which
+			// only reports pods with coincident usage samples. That is
+			// a window-boundary race, not a pod-count bug.
+			promAliveInput := prometheus.PrometheusInput{
+				Metric:              "kube_pod_container_status_running",
+				MetricNotEqualTo:    "0",
+				Function:            []string{"avg"},
+				AggregateBy:         []string{"container", "pod", "namespace", "node"},
+				AggregateWindow:     tc.window,
+				AggregateResolution: "1m",
+				Time:                &endTime,
+			}
+
+			promAliveResponse, err := client.RunPromQLQuery(promAliveInput, t)
+			if err != nil {
+				t.Fatalf("Error while calling Prometheus API %v", err)
+			}
+
+			alivePods := make(map[string]bool)
+			for _, metric := range promAliveResponse.Data.Result {
+				alivePods[metric.Metric.Pod] = true
+			}
+
 			var apiAllocationPodNames []string
 			for podName, _ := range apiResponse.Data.Sets[0].Allocations {
 				// Synthetic value generated and returned by /allocation and not /prometheus
@@ -92,6 +119,14 @@ func TestQueryAllocationSummary(t *testing.T) {
 				if promItem.Value.Value == 0 {
 					continue
 				}
+				// Skip pods that are not alive at the query end time.
+				// /allocation/summary only returns pods with usage data
+				// in the window, so short-lived pods that were up
+				// earlier in the 24h window but not at endTime would
+				// otherwise produce spurious mismatches.
+				if !alivePods[promItem.Metric.Pod] {
+					continue
+				}
 				if !slices.Contains(promPodNames, promItem.Metric.Pod) {
 					promPodNames = append(promPodNames, promItem.Metric.Pod)
 				}
-- 
2.43.0