Просмотр исходного кода

Add logging to investigate pod count mismatch

Christian Petersen 1 месяц назад
Родитель
Коммит
6ce00e7178

+ 18 - 0
pkg/costmodel/allocation.go

@@ -537,7 +537,21 @@ func (cm *CostModel) computeAllocation(start, end time.Time) (*opencost.Allocati
 	cm.applyNodesToPod(podMap, nodeMap)
 
 	// (3) Build out AllocationSet from Pod map
+	emittedPods := 0
+	droppedNoContainers := 0
+	var droppedSamples []string
 	for _, pod := range podMap {
+		if len(pod.Allocations) == 0 {
+			// Pod was observed (e.g. via kube_pod_container_status_running) but no
+			// container allocation data was ever attached. This is the most common
+			// reason a short-lived pod is in Prometheus but missing from /allocation.
+			droppedNoContainers++
+			if len(droppedSamples) < 10 {
+				droppedSamples = append(droppedSamples, pod.Key.String())
+			}
+			continue
+		}
+		emittedPods++
 		for _, alloc := range pod.Allocations {
 			cluster := alloc.Properties.Cluster
 			nodeName := alloc.Properties.Node
@@ -551,6 +565,10 @@ func (cm *CostModel) computeAllocation(start, end time.Time) (*opencost.Allocati
 			allocSet.Set(alloc)
 		}
 	}
+	if droppedNoContainers > 0 {
+		log.Debugf("CostModel.ComputeAllocation: emitted %d pods, dropped %d pods with no container allocations (sample: %v)",
+			emittedPods, droppedNoContainers, droppedSamples)
+	}
 
 	return allocSet, nodeMap, nil
 }

+ 13 - 0
pkg/costmodel/allocation_helpers.go

@@ -77,16 +77,24 @@ func (cm *CostModel) buildPodMap(window opencost.Window, podMap map[podKey]*pod,
 	// so filter out the non-containing results so we don't duplicate pods. This is due to the
 	// default setup of Kubecost having replicated kube_pod_container_status_running and
 	// included KSM kube_pod_container_status_running. Querying w/ UID will return both.
+	preFilterCount := len(resPods)
 	if ingestPodUID {
 		var resPodsUID []*source.PodsResult
+		var droppedNoUID []string
 
 		for _, res := range resPods {
 			uid := res.UID
 			if uid != "" {
 				resPodsUID = append(resPodsUID, res)
+			} else {
+				droppedNoUID = append(droppedNoUID, fmt.Sprintf("%s/%s", res.Namespace, res.Pod))
 			}
 		}
 
+		if len(droppedNoUID) > 0 {
+			log.Debugf("CostModel.ComputeAllocation: dropped %d pod results lacking UID during UID-ingestion filter: %v", len(droppedNoUID), droppedNoUID)
+		}
+
 		if len(resPodsUID) > 0 {
 			resPods = resPodsUID
 		} else {
@@ -94,7 +102,10 @@ func (cm *CostModel) buildPodMap(window opencost.Window, podMap map[podKey]*pod,
 		}
 	}
 
+	preApplyMapSize := len(podMap)
 	applyPodResults(window, resolution, podMap, resPods, ingestPodUID, podUIDKeyMap)
+	log.Debugf("CostModel.ComputeAllocation: buildPodMap window=%s: prom returned %d pod results, %d post-UID-filter, podMap grew by %d (now %d)",
+		window, preFilterCount, len(resPods), len(podMap)-preApplyMapSize, len(podMap))
 
 	return nil
 }
@@ -143,6 +154,8 @@ func applyPodResults(window opencost.Window, resolution time.Duration, podMap ma
 
 		allocStart, allocEnd := calculateStartAndEnd(res.Data, resolution, window)
 		if allocStart.IsZero() || allocEnd.IsZero() {
+			log.Debugf("CostModel.ComputeAllocation: dropping pod %s/%s (cluster=%s, uid=%q): zero allocStart/allocEnd from %d data points, window=%s",
+				namespace, podName, cluster, res.UID, len(res.Data), window)
 			continue
 		}
 

+ 10 - 4
pkg/costmodel/assets.go

@@ -163,11 +163,17 @@ func (cm *CostModel) ComputeAssets(start, end time.Time) (*opencost.AssetSet, er
 		node.RAMCost = n.RAMCost
 
 		if n.Overhead != nil {
+			// Only divide when node.TotalCost() > 0: dividing by zero yields NaN (0/0)
+			// or ±Inf, neither of which is JSON-representable, and that breaks
+			// downstream consumers (e.g. the MCP marshaler).
+			overheadCostFraction := 0.0
+			if total := node.TotalCost(); total > 0 {
+				overheadCostFraction = ((n.Overhead.CpuOverheadFraction * n.CPUCost) +
+					(n.Overhead.RamOverheadFraction * n.RAMCost)) / total
+			}
 			node.Overhead = &opencost.NodeOverhead{
-				RamOverheadFraction: n.Overhead.RamOverheadFraction,
-				CpuOverheadFraction: n.Overhead.CpuOverheadFraction,
-				OverheadCostFraction: ((n.Overhead.CpuOverheadFraction * n.CPUCost) +
-					(n.Overhead.RamOverheadFraction * n.RAMCost)) / node.TotalCost(),
+				RamOverheadFraction:  n.Overhead.RamOverheadFraction,
+				CpuOverheadFraction:  n.Overhead.CpuOverheadFraction,
+				OverheadCostFraction: overheadCostFraction,
 			}
 		} else {
 			node.Overhead = &opencost.NodeOverhead{}

+ 13 - 5
pkg/costmodel/cluster_helpers.go

@@ -446,9 +446,14 @@ func buildOverheadMap(capRam, allocRam, capCPU, allocCPU map[nodeIdentifierNoPro
 			log.Warnf("Could not find allocatable ram for node %s", identifier.Name)
 			continue
 		}
-		overheadBytes := ramCapacity - allocatableRam
+		// Only divide when ramCapacity > 0: dividing by zero yields NaN (0/0) or
+		// ±Inf, neither of which is JSON-representable, and that breaks
+		// downstream consumers (e.g. the MCP marshaler).
+		ramFraction := 0.0
+		if ramCapacity > 0 {
+			ramFraction = (ramCapacity - allocatableRam) / ramCapacity
+		}
 		m[identifier] = &NodeOverhead{
-			RamOverheadFraction: overheadBytes / ramCapacity,
+			RamOverheadFraction: ramFraction,
 		}
 	}
 
@@ -459,13 +464,16 @@ func buildOverheadMap(capRam, allocRam, capCPU, allocCPU map[nodeIdentifierNoPro
 			continue
 		}
 
-		overhead := cpuCapacity - allocatableCPU
+		cpuFraction := 0.0
+		if cpuCapacity > 0 {
+			cpuFraction = (cpuCapacity - allocatableCPU) / cpuCapacity
+		}
 
 		if _, found := m[identifier]; found {
-			m[identifier].CpuOverheadFraction = overhead / cpuCapacity
+			m[identifier].CpuOverheadFraction = cpuFraction
 		} else {
 			m[identifier] = &NodeOverhead{
-				CpuOverheadFraction: overhead / cpuCapacity,
+				CpuOverheadFraction: cpuFraction,
 			}
 		}