Przeglądaj źródła

add guardrail based on max CPU count

Ajay Tripathy 4 lat temu
rodzic
commit
052b4825ae
1 zmienionych plików z 18 dodań i 0 usunięć
  1. 18 0
      pkg/costmodel/allocation.go

+ 18 - 0
pkg/costmodel/allocation.go

@@ -15,6 +15,7 @@ import (
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/prom"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/klog"
 )
 
 const (
@@ -68,6 +69,11 @@ const (
 	queryFmtLBActiveMins          = `count(kubecost_load_balancer_cost) by (namespace, service_name, %s)[%s:%s]%s`
 )
 
+// This is a bit of a hack to work around garbage data from cadvisor
+// The maximum CPU count on AWS is 96. Standard_M192idms_v2 on Azure allows 192. GCP offers 224 in n2d. Many improvements are possible here to get tighter bounds, but 96 seems reasonable for now.
+// Ideally you would cap each pod to the max CPU on its node, but that involves a bit more complexity, as it would need to be done when allocations are joined with asset data.
+const MAX_CPU_CAP = 96
+
 // CanCompute should return true if CostModel can act as a valid source for the
 // given time range. In the case of CostModel we want to attempt to compute as
 // long as the range starts in the past. If the CostModel ends up not having
@@ -647,6 +653,10 @@ func applyCPUCoresAllocated(podMap map[podKey]*Pod, resCPUCoresAllocated []*prom
 		}
 
 		cpuCores := res.Values[0].Value
+		if cpuCores > MAX_CPU_CAP {
+			klog.Infof("[WARNING] Very large cpu allocation, clamping to %f", res.Values[0].Value*(pod.Allocations[container].Minutes()/60.0))
+			cpuCores = 0.0
+		}
 		hours := pod.Allocations[container].Minutes() / 60.0
 		pod.Allocations[container].CPUCoreHours = cpuCores * hours
 
@@ -689,6 +699,10 @@ func applyCPUCoresRequested(podMap map[podKey]*Pod, resCPUCoresRequested []*prom
 		if pod.Allocations[container].CPUCores() < res.Values[0].Value {
 			pod.Allocations[container].CPUCoreHours = res.Values[0].Value * (pod.Allocations[container].Minutes() / 60.0)
 		}
+		if pod.Allocations[container].CPUCores() > MAX_CPU_CAP {
+			klog.Infof("[WARNING] Very large cpu allocation, clamping! to %f", res.Values[0].Value*(pod.Allocations[container].Minutes()/60.0))
+			pod.Allocations[container].CPUCoreHours = res.Values[0].Value * (pod.Allocations[container].Minutes() / 60.0)
+		}
 
 		node, err := res.GetString("node")
 		if err != nil {
@@ -725,6 +739,10 @@ func applyCPUCoresUsedAvg(podMap map[podKey]*Pod, resCPUCoresUsedAvg []*prom.Que
 		}
 
 		pod.Allocations[container].CPUCoreUsageAverage = res.Values[0].Value
+		if res.Values[0].Value > MAX_CPU_CAP {
+			klog.Infof("[WARNING] Very large cpu USAGE, dropping outlier")
+			pod.Allocations[container].CPUCoreUsageAverage = 0.0
+		}
 	}
 }