|
|
@@ -325,6 +325,19 @@ func (cmme *CostModelMetricsEmitter) IsRunning() bool {
|
|
|
return cmme.recordingStop != nil
|
|
|
}
|
|
|
|
|
|
+// NodeCostAverages tracks a running average of a node's cost attributes.
|
|
|
+// The averages are used to detect and discard spurious outliers.
|
|
|
+type NodeCostAverages struct {
|
|
|
+ CpuCostAverage float64 // running mean of the CPU cost samples that passed the outlier check
|
|
|
+ RamCostAverage float64 // running mean of the RAM cost samples that passed the outlier check
|
|
|
+ GpuCostAverage float64 // running mean of the GPU cost samples that passed the outlier check
|
|
|
+ TotalCostAverage float64 // running mean of total node cost; updated only when no constituent cost was flagged
|
|
|
+ NumCpuDataPoints float64 // number of samples folded into CpuCostAverage; float64 because it is used directly in the mean arithmetic
|
|
|
+ NumRamDataPoints float64 // number of samples folded into RamCostAverage
|
|
|
+ NumGpuDataPoints float64 // number of samples folded into GpuCostAverage
|
|
|
+ NumTotalCostDataPoints float64 // number of samples folded into TotalCostAverage
|
|
|
+}
|
|
|
+
|
|
|
// StartCostModelMetricRecording starts the go routine that emits metrics used to determine
|
|
|
// cluster costs.
|
|
|
func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
@@ -344,6 +357,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
loadBalancerSeen := make(map[string]bool)
|
|
|
pvSeen := make(map[string]bool)
|
|
|
pvcSeen := make(map[string]bool)
|
|
|
+ nodeCostAverages := make(map[string]NodeCostAverages)
|
|
|
|
|
|
getKeyFromLabelStrings := func(labels ...string) string {
|
|
|
return strings.Join(labels, ",")
|
|
|
@@ -450,17 +464,67 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
|
|
|
totalCost := cpu*cpuCost + ramCost*(ram/1024/1024/1024) + gpu*gpuCost
|
|
|
|
|
|
- cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
|
|
|
- cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
|
|
|
- cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
|
|
|
+ labelKey := getKeyFromLabelStrings(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID)
|
|
|
+
|
|
|
+ avgCosts, ok := nodeCostAverages[labelKey]
|
|
|
+
|
|
|
+ // initialize average cost tracking for this node if there is none
|
|
|
+ if !ok {
|
|
|
+ avgCosts = NodeCostAverages{
|
|
|
+ CpuCostAverage: cpuCost,
|
|
|
+ RamCostAverage: ramCost,
|
|
|
+ GpuCostAverage: gpuCost,
|
|
|
+ TotalCostAverage: totalCost,
|
|
|
+ NumCpuDataPoints: 1,
|
|
|
+ NumRamDataPoints: 1,
|
|
|
+ NumGpuDataPoints: 1,
|
|
|
+ NumTotalCostDataPoints: 1,
|
|
|
+ }
|
|
|
+ nodeCostAverages[labelKey] = avgCosts
|
|
|
+ }
|
|
|
+
|
|
|
cmme.GPUCountRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpu)
|
|
|
- cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
|
|
|
+
|
|
|
+ const outlierFactor float64 = 30
|
|
|
+ // don't record cpuCost, ramCost, or gpuCost in the case of wild outliers
|
|
|
+ // k8s api sometimes causes cost spikes as described here:
|
|
|
+ // https://github.com/kubecost/cost-model/issues/927
|
|
|
+ if cpuCost < outlierFactor*avgCosts.CpuCostAverage {
|
|
|
+ cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
|
|
|
+ avgCosts.CpuCostAverage = (avgCosts.CpuCostAverage*avgCosts.NumCpuDataPoints + cpuCost) / (avgCosts.NumCpuDataPoints + 1)
|
|
|
+ avgCosts.NumCpuDataPoints += 1
|
|
|
+ } else {
|
|
|
+ log.Warningf("CPU cost outlier detected; skipping data point.")
|
|
|
+ }
|
|
|
+ if ramCost < outlierFactor*avgCosts.RamCostAverage {
|
|
|
+ cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
|
|
|
+ avgCosts.RamCostAverage = (avgCosts.RamCostAverage*avgCosts.NumRamDataPoints + ramCost) / (avgCosts.NumRamDataPoints + 1)
|
|
|
+ avgCosts.NumRamDataPoints += 1
|
|
|
+ } else {
|
|
|
+ log.Warningf("RAM cost outlier detected; skipping data point.")
|
|
|
+ }
|
|
|
+ if gpuCost < outlierFactor*avgCosts.GpuCostAverage {
|
|
|
+ cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
|
|
|
+ avgCosts.GpuCostAverage = (avgCosts.GpuCostAverage*avgCosts.NumGpuDataPoints + gpuCost) / (avgCosts.NumGpuDataPoints + 1)
|
|
|
+ avgCosts.NumGpuDataPoints += 1
|
|
|
+ } else {
|
|
|
+ log.Warningf("GPU cost outlier detected; skipping data point.")
|
|
|
+ }
|
|
|
+ // skip recording totalCost if any constituent costs were outliers
|
|
|
+ if cpuCost < outlierFactor*avgCosts.CpuCostAverage &&
|
|
|
+ ramCost < outlierFactor*avgCosts.RamCostAverage &&
|
|
|
+ gpuCost < outlierFactor*avgCosts.GpuCostAverage {
|
|
|
+
|
|
|
+ cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
|
|
|
+ avgCosts.TotalCostAverage = (avgCosts.TotalCostAverage*avgCosts.NumTotalCostDataPoints + totalCost) / (avgCosts.NumTotalCostDataPoints + 1)
|
|
|
+ avgCosts.NumTotalCostDataPoints += 1
|
|
|
+ }
|
|
|
+
|
|
|
if node.IsSpot() {
|
|
|
cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(1.0)
|
|
|
} else {
|
|
|
cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(0.0)
|
|
|
}
|
|
|
- labelKey := getKeyFromLabelStrings(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID)
|
|
|
nodeSeen[labelKey] = true
|
|
|
}
|
|
|
|
|
|
@@ -606,6 +670,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
|
|
|
klog.Infof("FAILURE TO REMOVE %s from ramprice", labelString)
|
|
|
}
|
|
|
delete(nodeSeen, labelString)
|
|
|
+ delete(nodeCostAverages, labelString)
|
|
|
} else {
|
|
|
nodeSeen[labelString] = false
|
|
|
}
|