
fix(costmodel): take max across duplicate rows in applyCPUCoresUsedMax/applyRAMBytesUsedMax/applyGPUUsageMax

QueryCPUUsageMax, QueryRAMUsageMax, and QueryGPUsUsageMax use PromQL
queries of the form

  max(max_over_time(<metric>{...}[window])) by (
    container_name, container, pod_name, pod, namespace,
    node, instance, uid, <cluster_label>)

which groups by identifiers that are not part of the podKey used by
applyCPUCoresUsedMax, applyRAMBytesUsedMax, and applyGPUUsageMax. When
more than one row maps to the same (cluster, namespace, pod, container)
- e.g. because a pod restarted mid-window and kube-state-metrics emitted
rows under both the old and new uid, or because kubelet/cAdvisor was
scraped from multiple instances - these applier functions were
unconditionally overwriting the previously-stored value with
res.Data[0].Value of whichever row happened to be iterated last. Because
Go map iteration order is unspecified, the resulting max was arbitrary,
and in practice frequently came out orders of magnitude below the
correct value.
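
A minimal, runnable sketch of the failure mode (the row struct and key
are illustrative stand-ins for the real source result types; the two
values are the ones from the repro below):

  package main

  import (
  	"fmt"
  	"math"
  )

  func main() {
  	// Two rows for the same (cluster, namespace, pod, container)
  	// key, e.g. one per uid after a mid-window restart; their
  	// relative order is effectively arbitrary.
  	rows := []struct {
  		Key   string
  		Value float64
  	}{
  		{"ns/pod/c1", 65101824}, // row under the old uid
  		{"ns/pod/c1", 442368},   // row under the new uid
  	}
  	buggy := map[string]float64{}
  	fixed := map[string]float64{}
  	for _, r := range rows {
  		buggy[r.Key] = r.Value                         // old: last row wins
  		fixed[r.Key] = math.Max(fixed[r.Key], r.Value) // new: order-independent
  	}
  	fmt.Println(buggy["ns/pod/c1"], fixed["ns/pod/c1"]) // prints: 442368 6.5101824e+07
  }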

This reproduced on the opencost test stack in scheduled integration
runs (e.g. runs 24806697901 and 24784087571), where the
TestRAMMax/Last_Two_Days and TestRAMAvgUsage tests reported a
DifferencePercent of around 99% for a single pod
(test-install-speedtest-tracker-cnpg-main-1, where /allocation
reported 442368 bytes vs Prometheus's 65101824 bytes), matching the
expected pattern of a
StatefulSet pod that was restarted during the 48h window. The other
(non-restarted) replicas in the same StatefulSet passed.

Fix: aggregate the values for the same (cluster, namespace, pod,
container) key using math.Max (and, for the pointer-valued GPUUsageMax
field, an explicit > comparison that also covers the nil case). This
matches the semantics of the underlying query ('max' aggregator) and is
independent of result iteration order.

The RAM-avg and CPU-avg variants exhibit the same duplicate-row
overwrite but do not admit an equally simple fix, because combining
averages correctly requires weighting by container run-time; a
wrong-but-deterministic value is typically the safer fallback there,
so this commit leaves those paths untouched and fixes only the 'max'
aggregators, where math.Max is trivially correct.
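
For reference, a sketch (under the assumption of hypothetical per-row
run-times, which the avg query results do not carry) of the weighting
a correct avg combination would need:

  // weightedAvg combines per-row averages by run-time. An unweighted
  // max or mean over avg rows is wrong whenever run-times differ.
  func weightedAvg(avgs, runtimes []float64) float64 {
  	var num, den float64
  	for i := range avgs {
  		num += avgs[i] * runtimes[i]
  		den += runtimes[i]
  	}
  	if den == 0 {
  		return 0
  	}
  	return num / den
  }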

Testing:
  * go vet ./... - clean
  * gofmt -l . - empty
  * go build ./... - clean
  * go test ./pkg/costmodel/... - all pass, including three new
    regression tests (one for each fixed function) asserting that the
    largest value wins regardless of iteration order.

Signed-off-by: Cursor Agent <cursor@opencost.io>

Co-authored-by: Alex Meijer <ameijer@users.noreply.github.com>
commit a7a9dd6b7f

+ 37 - 4
pkg/costmodel/allocation_helpers.go

@@ -416,12 +416,23 @@ func applyCPUCoresUsedMax(podMap map[podKey]*pod, resCPUCoresUsedMax []*source.C
 				thisPod.appendContainer(container)
 			}
 
+			// Take the max across all result rows for the same
+			// (cluster, namespace, pod, container). The upstream query
+			// groups by additional labels (uid, instance, ...) and can
+			// return multiple rows for the same pod+container when, for
+			// example, a pod restarted mid-window (new uid) or was
+			// scraped from more than one instance. Prior to this, the
+			// last row iterated overwrote any earlier (possibly larger)
+			// value, which produced arbitrarily low maxima.
 			if thisPod.Allocations[container].RawAllocationOnly == nil {
 				thisPod.Allocations[container].RawAllocationOnly = &opencost.RawAllocationOnlyData{
 					CPUCoreUsageMax: res.Data[0].Value,
 				}
 			} else {
-				thisPod.Allocations[container].RawAllocationOnly.CPUCoreUsageMax = res.Data[0].Value
+				thisPod.Allocations[container].RawAllocationOnly.CPUCoreUsageMax = math.Max(
+					thisPod.Allocations[container].RawAllocationOnly.CPUCoreUsageMax,
+					res.Data[0].Value,
+				)
 			}
 		}
 	}
@@ -665,12 +676,23 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*source.R
 				thisPod.appendContainer(container)
 			}
 
+			// Take the max across all result rows for the same
+			// (cluster, namespace, pod, container). The upstream query
+			// groups by additional labels (uid, instance, ...) and can
+			// return multiple rows for the same pod+container when, for
+			// example, a pod restarted mid-window (new uid) or was
+			// scraped from more than one instance. Prior to this, the
+			// last row iterated overwrote any earlier (possibly larger)
+			// value, which produced arbitrarily low maxima.
 			if thisPod.Allocations[container].RawAllocationOnly == nil {
 				thisPod.Allocations[container].RawAllocationOnly = &opencost.RawAllocationOnlyData{
 					RAMBytesUsageMax: res.Data[0].Value,
 				}
 			} else {
-				thisPod.Allocations[container].RawAllocationOnly.RAMBytesUsageMax = res.Data[0].Value
+				thisPod.Allocations[container].RawAllocationOnly.RAMBytesUsageMax = math.Max(
+					thisPod.Allocations[container].RawAllocationOnly.RAMBytesUsageMax,
+					res.Data[0].Value,
+				)
 			}
 		}
 	}
@@ -757,12 +779,23 @@ func applyGPUUsageMax(podMap map[podKey]*pod, resGPUUsageMax []*source.GPUsUsage
 				thisPod.appendContainer(container)
 			}
 
+			// Take the max across all result rows for the same
+			// (cluster, namespace, pod, container). The upstream query
+			// groups by additional labels (uid, device, ...) and can
+			// return multiple rows for the same pod+container (for
+			// example a pod that restarted mid-window gets a new uid).
+			// Prior to this, the last row iterated overwrote any
+			// earlier (possibly larger) value.
+			v := res.Data[0].Value
 			if thisPod.Allocations[container].RawAllocationOnly == nil {
 				thisPod.Allocations[container].RawAllocationOnly = &opencost.RawAllocationOnlyData{
-					GPUUsageMax: &res.Data[0].Value,
+					GPUUsageMax: &v,
 				}
 			} else {
-				thisPod.Allocations[container].RawAllocationOnly.GPUUsageMax = &res.Data[0].Value
+				existing := thisPod.Allocations[container].RawAllocationOnly.GPUUsageMax
+				if existing == nil || v > *existing {
+					thisPod.Allocations[container].RawAllocationOnly.GPUUsageMax = &v
+				}
 			}
 		}
 	}

+ 201 - 0
pkg/costmodel/allocation_helpers_test.go

@@ -622,3 +622,204 @@ func TestCalculateStartAndEnd(t *testing.T) {
 		})
 	}
 }
+
+// makePodMapWithEmptyAllocations builds a pod map with the given podKey present
+// and an empty Allocations map, which mirrors what buildPodMap produces before
+// container-scoped apply functions run.
+func makePodMapWithEmptyAllocations(pk podKey) map[podKey]*pod {
+	return map[podKey]*pod{
+		pk: {
+			Window:      window.Clone(),
+			Start:       windowStart,
+			End:         windowEnd,
+			Key:         pk,
+			Allocations: map[string]*opencost.Allocation{},
+		},
+	}
+}
+
+// ramUsageMaxResult builds a RAMUsageMaxResult for a single container with the
+// given pod-level identity and sample value.
+func ramUsageMaxResult(pk podKey, container string, value float64) *source.RAMUsageMaxResult {
+	return &source.RAMUsageMaxResult{
+		Cluster:   pk.Cluster,
+		Namespace: pk.Namespace,
+		Pod:       pk.Pod,
+		Container: container,
+		Data: []*util.Vector{
+			{Value: value},
+		},
+	}
+}
+
+// cpuUsageMaxResult builds a CPUUsageMaxResult for a single container with the
+// given pod-level identity and sample value.
+func cpuUsageMaxResult(pk podKey, container string, value float64) *source.CPUUsageMaxResult {
+	return &source.CPUUsageMaxResult{
+		Cluster:   pk.Cluster,
+		Namespace: pk.Namespace,
+		Pod:       pk.Pod,
+		Container: container,
+		Data: []*util.Vector{
+			{Value: value},
+		},
+	}
+}
+
+// gpuUsageMaxResult builds a GPUsUsageMaxResult for a single container with
+// the given pod-level identity and sample value.
+func gpuUsageMaxResult(pk podKey, container string, value float64) *source.GPUsUsageMaxResult {
+	return &source.GPUsUsageMaxResult{
+		Cluster:   pk.Cluster,
+		Namespace: pk.Namespace,
+		Pod:       pk.Pod,
+		Container: container,
+		Data: []*util.Vector{
+			{Value: value},
+		},
+	}
+}
+
+// TestApplyRAMBytesUsedMax_KeepsLargestAcrossDuplicateRows regression-tests
+// the case where the Prometheus RAM max query returns multiple rows for the
+// same (cluster, namespace, pod, container) combination (for example when a
+// pod restarted mid-window and kube-state-metrics emitted a new uid row, or
+// when the pod was scraped from more than one instance). The previous
+// implementation overwrote the stored max with whichever row happened to be
+// iterated last, producing an arbitrary (and typically tiny) maximum. This
+// test asserts that applyRAMBytesUsedMax now takes the max across rows.
+func TestApplyRAMBytesUsedMax_KeepsLargestAcrossDuplicateRows(t *testing.T) {
+	const container = "c1"
+	const small = 442368.0
+	const large = 65101824.0
+
+	// The order of duplicate-row results must not matter; verify both orderings.
+	testCases := map[string]struct {
+		results []*source.RAMUsageMaxResult
+	}{
+		"large first, small second": {
+			results: []*source.RAMUsageMaxResult{
+				ramUsageMaxResult(podKey1, container, large),
+				ramUsageMaxResult(podKey1, container, small),
+			},
+		},
+		"small first, large second": {
+			results: []*source.RAMUsageMaxResult{
+				ramUsageMaxResult(podKey1, container, small),
+				ramUsageMaxResult(podKey1, container, large),
+			},
+		},
+	}
+
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			podMap := makePodMapWithEmptyAllocations(podKey1)
+
+			applyRAMBytesUsedMax(podMap, tc.results, map[podKey][]podKey{})
+
+			alloc, ok := podMap[podKey1].Allocations[container]
+			if !ok {
+				t.Fatalf("container allocation %q was not created", container)
+			}
+			if alloc.RawAllocationOnly == nil {
+				t.Fatalf("RawAllocationOnly was not populated for container %q", container)
+			}
+			if got := alloc.RawAllocationOnly.RAMBytesUsageMax; got != large {
+				t.Errorf("RAMBytesUsageMax = %v; want %v (max across duplicate rows)", got, large)
+			}
+		})
+	}
+}
+
+// TestApplyCPUCoresUsedMax_KeepsLargestAcrossDuplicateRows is the CPU analogue
+// of TestApplyRAMBytesUsedMax_KeepsLargestAcrossDuplicateRows.
+func TestApplyCPUCoresUsedMax_KeepsLargestAcrossDuplicateRows(t *testing.T) {
+	const container = "c1"
+	const small = 0.01
+	const large = 1.75
+
+	testCases := map[string]struct {
+		results []*source.CPUUsageMaxResult
+	}{
+		"large first, small second": {
+			results: []*source.CPUUsageMaxResult{
+				cpuUsageMaxResult(podKey1, container, large),
+				cpuUsageMaxResult(podKey1, container, small),
+			},
+		},
+		"small first, large second": {
+			results: []*source.CPUUsageMaxResult{
+				cpuUsageMaxResult(podKey1, container, small),
+				cpuUsageMaxResult(podKey1, container, large),
+			},
+		},
+	}
+
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			podMap := makePodMapWithEmptyAllocations(podKey1)
+
+			applyCPUCoresUsedMax(podMap, tc.results, map[podKey][]podKey{})
+
+			alloc, ok := podMap[podKey1].Allocations[container]
+			if !ok {
+				t.Fatalf("container allocation %q was not created", container)
+			}
+			if alloc.RawAllocationOnly == nil {
+				t.Fatalf("RawAllocationOnly was not populated for container %q", container)
+			}
+			if got := alloc.RawAllocationOnly.CPUCoreUsageMax; got != large {
+				t.Errorf("CPUCoreUsageMax = %v; want %v (max across duplicate rows)", got, large)
+			}
+		})
+	}
+}
+
+// TestApplyGPUUsageMax_KeepsLargestAcrossDuplicateRows is the GPU analogue.
+// It additionally asserts the behavior for the pointer-valued GPUUsageMax
+// field, covering both the fresh-create and update paths.
+func TestApplyGPUUsageMax_KeepsLargestAcrossDuplicateRows(t *testing.T) {
+	const container = "c1"
+	const small = 0.02
+	const large = 0.97
+
+	testCases := map[string]struct {
+		results []*source.GPUsUsageMaxResult
+	}{
+		"large first, small second": {
+			results: []*source.GPUsUsageMaxResult{
+				gpuUsageMaxResult(podKey1, container, large),
+				gpuUsageMaxResult(podKey1, container, small),
+			},
+		},
+		"small first, large second": {
+			results: []*source.GPUsUsageMaxResult{
+				gpuUsageMaxResult(podKey1, container, small),
+				gpuUsageMaxResult(podKey1, container, large),
+			},
+		},
+	}
+
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			podMap := makePodMapWithEmptyAllocations(podKey1)
+
+			applyGPUUsageMax(podMap, tc.results, map[podKey][]podKey{})
+
+			alloc, ok := podMap[podKey1].Allocations[container]
+			if !ok {
+				t.Fatalf("container allocation %q was not created", container)
+			}
+			if alloc.RawAllocationOnly == nil {
+				t.Fatalf("RawAllocationOnly was not populated for container %q", container)
+			}
+			ptr := alloc.RawAllocationOnly.GPUUsageMax
+			if ptr == nil {
+				t.Fatalf("GPUUsageMax was nil; expected %v", large)
+			}
+			if *ptr != large {
+				t.Errorf("GPUUsageMax = %v; want %v (max across duplicate rows)", *ptr, large)
+			}
+		})
+	}
+}