Explorar o código

Modifies metric emission loop to skip emitting cost data points when
cost spikes to > 30x a running average.

Occasionally, the k8s API emits a spurious ~0 RAM capacity reading,
which causes computed RAM cost to skyrocket for that timepoint. This
change keeps an in-memory average of data points and discards wild
outliers.

Neal Ormsbee hai 4 anos
pai
achega
e634de0401
Modificáronse 1 ficheiros con 70 adicións e 5 borrados
  1. 70 5
      pkg/costmodel/metrics.go

+ 70 - 5
pkg/costmodel/metrics.go

@@ -325,6 +325,19 @@ func (cmme *CostModelMetricsEmitter) IsRunning() bool {
 	return cmme.recordingStop != nil
 }
 
+// NodeCostAverages tracks a running average of a node's cost attributes.
+// The averages are used to detect and discard spurious outliers.
+type NodeCostAverages struct {
+	CpuCostAverage         float64 // running mean of the accepted CPU cost data points
+	RamCostAverage         float64 // running mean of the accepted RAM cost data points
+	GpuCostAverage         float64 // running mean of the accepted GPU cost data points
+	TotalCostAverage       float64 // running mean of the accepted total node cost data points
+	NumCpuDataPoints       float64 // number of data points folded into CpuCostAverage; float64 because it is used directly in the mean arithmetic
+	NumRamDataPoints       float64 // number of data points folded into RamCostAverage
+	NumGpuDataPoints       float64 // number of data points folded into GpuCostAverage
+	NumTotalCostDataPoints float64 // number of data points folded into TotalCostAverage
+}
+
 // StartCostModelMetricRecording starts the go routine that emits metrics used to determine
 // cluster costs.
 func (cmme *CostModelMetricsEmitter) Start() bool {
@@ -344,6 +357,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 		loadBalancerSeen := make(map[string]bool)
 		pvSeen := make(map[string]bool)
 		pvcSeen := make(map[string]bool)
+		nodeCostAverages := make(map[string]NodeCostAverages)
 
 		getKeyFromLabelStrings := func(labels ...string) string {
 			return strings.Join(labels, ",")
@@ -450,17 +464,67 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 
 				totalCost := cpu*cpuCost + ramCost*(ram/1024/1024/1024) + gpu*gpuCost
 
-				cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
-				cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
-				cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
+				labelKey := getKeyFromLabelStrings(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID)
+
+				avgCosts, ok := nodeCostAverages[labelKey]
+
+				// initialize average cost tracking for this node if there is none
+				if !ok {
+					avgCosts = NodeCostAverages{
+						CpuCostAverage:         cpuCost,
+						RamCostAverage:         ramCost,
+						GpuCostAverage:         gpuCost,
+						TotalCostAverage:       totalCost,
+						NumCpuDataPoints:       1,
+						NumRamDataPoints:       1,
+						NumGpuDataPoints:       1,
+						NumTotalCostDataPoints: 1,
+					}
+					nodeCostAverages[labelKey] = avgCosts
+				}
+
 				cmme.GPUCountRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpu)
-				cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
+
+				const outlierFactor float64 = 30
+				// don't record cpuCost, ramCost, or gpuCost in the case of wild outliers
+				// k8s api sometimes causes cost spikes as described here:
+				// https://github.com/kubecost/cost-model/issues/927
+				if cpuCost < outlierFactor*avgCosts.CpuCostAverage {
+					cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
+					avgCosts.CpuCostAverage = (avgCosts.CpuCostAverage*avgCosts.NumCpuDataPoints + cpuCost) / (avgCosts.NumCpuDataPoints + 1)
+					avgCosts.NumCpuDataPoints += 1
+				} else {
+					log.Warningf("CPU cost outlier detected; skipping data point.")
+				}
+				if ramCost < outlierFactor*avgCosts.RamCostAverage {
+					cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
+					avgCosts.RamCostAverage = (avgCosts.RamCostAverage*avgCosts.NumRamDataPoints + ramCost) / (avgCosts.NumRamDataPoints + 1)
+					avgCosts.NumRamDataPoints += 1
+				} else {
+					log.Warningf("RAM cost outlier detected; skipping data point.")
+				}
+				if gpuCost < outlierFactor*avgCosts.GpuCostAverage {
+					cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
+					avgCosts.GpuCostAverage = (avgCosts.GpuCostAverage*avgCosts.NumGpuDataPoints + gpuCost) / (avgCosts.NumGpuDataPoints + 1)
+					avgCosts.NumGpuDataPoints += 1
+				} else {
+					log.Warningf("GPU cost outlier detected; skipping data point.")
+				}
+				// skip recording totalCost if any constituent costs were outliers
+				if cpuCost < outlierFactor*avgCosts.CpuCostAverage &&
+					ramCost < outlierFactor*avgCosts.RamCostAverage &&
+					gpuCost < outlierFactor*avgCosts.GpuCostAverage {
+
+					cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
+					avgCosts.TotalCostAverage = (avgCosts.TotalCostAverage*avgCosts.NumTotalCostDataPoints + totalCost) / (avgCosts.NumTotalCostDataPoints + 1)
+					avgCosts.NumTotalCostDataPoints += 1
+				}
+
 				if node.IsSpot() {
 					cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(1.0)
 				} else {
 					cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(0.0)
 				}
-				labelKey := getKeyFromLabelStrings(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID)
 				nodeSeen[labelKey] = true
 			}
 
@@ -606,6 +670,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 						klog.Infof("FAILURE TO REMOVE %s from ramprice", labelString)
 					}
 					delete(nodeSeen, labelString)
+					delete(nodeCostAverages, labelString)
 				} else {
 					nodeSeen[labelString] = false
 				}