4 lat temu · eb8e55ec05
--- a/pkg/costmodel/allocation.go
+++ b/pkg/costmodel/allocation.go
@@ -30,6 +30,7 @@ const (
 
				 	queryFmtCPUUsageMax              = `max(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s]%s)) by (container_name, container, pod_name, pod, namespace, instance, %s)`
			
 
				 	queryFmtGPUsRequested            = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, %s)`
			
 
				 	queryFmtGPUsAllocated            = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, %s)`
			
 
				+	queryFmtGPUUsageAvg              = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s]%s)) by (container, pod, namespace, %s)`
			
 
				 	queryFmtNodeCostPerCPUHr         = `avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
			
 
				 	queryFmtNodeCostPerRAMGiBHr      = `avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
			
 
				 	queryFmtNodeCostPerGPUHr         = `avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
			
@@ -158,6 +159,9 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 
				 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, durStr, offStr, env.GetPromClusterLabel())
			
 
				 	resChGPUsAllocated := ctx.Query(queryGPUsAllocated)
			
 
				 
			
 
				+	queryGPUUsageAvg := fmt.Sprintf(queryFmtGPUUsageAvg, durStr, offStr, env.GetPromClusterLabel())
			
 
				+	resChGPUUsageAvg := ctx.Query(queryGPUUsageAvg)
			
 
				+
			
 
				 	queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, durStr, offStr, env.GetPromClusterLabel())
			
 
				 	resChNodeCostPerCPUHr := ctx.Query(queryNodeCostPerCPUHr)
			
 
				 
			
@@ -258,6 +262,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 
				 	resRAMUsageMax, _ := resChRAMUsageMax.Await()
			
 
				 	resGPUsRequested, _ := resChGPUsRequested.Await()
			
 
				 	resGPUsAllocated, _ := resChGPUsAllocated.Await()
			
 
				+	resGPUUsageAvg, _ := resChGPUUsageAvg.Await()
			
 
				 
			
 
				 	resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
			
 
				 	resNodeCostPerRAMGiBHr, _ := resChNodeCostPerRAMGiBHr.Await()
			
@@ -314,6 +319,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 
				 	applyRAMBytesUsedAvg(podMap, resRAMUsageAvg)
			
 
				 	applyRAMBytesUsedMax(podMap, resRAMUsageMax)
			
 
				 	applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated)
			
 
				+	applyGPUUsageAvg(podMap, resGPUUsageAvg)
			
 
				 	applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes)
			
 
				 	applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB)
			
 
				 	applyNetworkAllocation(podMap, resNetRegionGiB, resNetRegionCostPerGiB)
			
@@ -954,6 +960,35 @@ func applyGPUsAllocated(podMap map[podKey]*Pod, resGPUsRequested []*prom.QueryRe
 
				 
			
 
				 		hrs := pod.Allocations[container].Minutes() / 60.0
			
 
				 		pod.Allocations[container].GPUHours = res.Values[0].Value * hrs
			
 
				+		pod.Allocations[container].GPURequestAverage = res.Values[0].Value
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func applyGPUUsageAvg(podMap map[podKey]*Pod, resGPUUsageAvg []*prom.QueryResult) { // sets GPUUsageAverage on each container allocation from the query results
			
 
				+	for _, res := range resGPUUsageAvg {
			
 
				+		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
			
 
				+		if err != nil {
			
 
				+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
			
 
				+			continue
			
 
				+		}
			
 
				+		// Results whose pod is not present in podMap are skipped silently.
			
 
				+		pod, ok := podMap[key]
			
 
				+		if !ok {
			
 
				+			continue
			
 
				+		}
			
 
				+		container, err := res.GetString("container")
			
 
				+		if err != nil {
			
 
				+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
			
 
				+			continue
			
 
				+		}
			
 
				+		// The allocation is written below, so it must exist first (AppendContainer is presumed to create it).
			
 
				+		if _, ok := pod.Allocations[container]; !ok {
			
 
				+			pod.AppendContainer(container)
			
 
				+		}
			
 
				+		// NOTE(review): res.Values[0] below assumes the result carries at least one value; confirm.
			
 
				+		// The metric reports utilization as a whole-number percentage, so multiply by 0.01 to
			
 
				+		// store a fraction that is consistent with the other efficiency fields.
			
 
				+		pod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/pkg/kubecost/allocation.go
+++ b/pkg/kubecost/allocation.go
@@ -61,6 +61,8 @@ type Allocation struct {
 
				 	CPUCost                    float64               `json:"cpuCost"`
			
 
				 	CPUCostAdjustment          float64               `json:"cpuCostAdjustment"`
			
 
				 	GPUHours                   float64               `json:"gpuHours"`
			
 
				+	GPURequestAverage          float64               `json:"gpuRequestAverage"`
			
 
				+	GPUUsageAverage            float64               `json:"gpuUsageAverage"`
			
 
				 	GPUCost                    float64               `json:"gpuCost"`
			
 
				 	GPUCostAdjustment          float64               `json:"gpuCostAdjustment"`
			
 
				 	NetworkTransferBytes       float64               `json:"networkTransferBytes"`
			
@@ -205,6 +207,8 @@ func (a *Allocation) Clone() *Allocation {
 
				 		CPUCost:                    a.CPUCost,
			
 
				 		CPUCostAdjustment:          a.CPUCostAdjustment,
			
 
				 		GPUHours:                   a.GPUHours,
			
 
				+		GPURequestAverage:          a.GPURequestAverage,
			
 
				+		GPUUsageAverage:            a.GPUUsageAverage,
			
 
				 		GPUCost:                    a.GPUCost,
			
 
				 		GPUCostAdjustment:          a.GPUCostAdjustment,
			
 
				 		NetworkTransferBytes:       a.NetworkTransferBytes,
			
@@ -274,6 +278,12 @@ func (a *Allocation) Equal(that *Allocation) bool {
 
				 	if !util.IsApproximately(a.GPUHours, that.GPUHours) {
			
 
				 		return false
			
 
				 	}
			
 
				+	if !util.IsApproximately(a.GPURequestAverage, that.GPURequestAverage) {
			
 
				+		return false
			
 
				+	}
			
 
				+	if !util.IsApproximately(a.GPUUsageAverage, that.GPUUsageAverage) {
			
 
				+		return false
			
 
				+	}
			
 
				 	if !util.IsApproximately(a.GPUCost, that.GPUCost) {
			
 
				 		return false
			
 
				 	}
			
@@ -436,6 +446,23 @@ func (a *Allocation) RAMEfficiency() float64 {
 
				 	return 1.0
			
 
				 }
			
 
				 
			
 
				+// GPUEfficiency is the ratio of average GPU usage to average GPU request when
			
 
				+// both are positive. Otherwise it is zero if GPUUsageAverage or GPUCost is
			
 
				+// zero, and 1.0 (100%) only when usage and cost are both non-zero without a
			
 
				+// positive request. Without DCGM usage metrics in Prometheus, GPUUsageAverage
			
 
				+// stays at its zero default, so this always returns zero.
			
 
				+func (a *Allocation) GPUEfficiency() float64 {
			
 
				+	if a.GPURequestAverage > 0 && a.GPUUsageAverage > 0 {
			
 
				+		return a.GPUUsageAverage / a.GPURequestAverage // NOTE(review): usage looks like a 0-1 fraction and request a GPU count; confirm for multi-GPU requests
			
 
				+	}
			
 
				+
			
 
				+	if a.GPUUsageAverage == 0.0 || a.GPUCost == 0.0 {
			
 
				+		return 0.0
			
 
				+	}
			
 
				+
			
 
				+	return 1.0
			
 
				+}
			
 
				+
			
 
				 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
			
 
				 // there is no cost at all, then efficiency is zero.
			
 
				 func (a *Allocation) TotalEfficiency() float64 {
			
@@ -508,6 +535,8 @@ func (a *Allocation) MarshalJSON() ([]byte, error) {
 
				 	jsonEncodeFloat64(buffer, "cpuEfficiency", a.CPUEfficiency(), ",")
			
 
				 	jsonEncodeFloat64(buffer, "gpuCount", a.GPUs(), ",")
			
 
				 	jsonEncodeFloat64(buffer, "gpuHours", a.GPUHours, ",")
			
 
				+	jsonEncodeFloat64(buffer, "gpuRequestAverage", a.GPURequestAverage, ",")
			
 
				+	jsonEncodeFloat64(buffer, "gpuUsageAverage", a.GPUUsageAverage, ",")
			
 
				 	jsonEncodeFloat64(buffer, "gpuCost", a.GPUCost, ",")
			
 
				 	jsonEncodeFloat64(buffer, "gpuCostAdjustment", a.GPUCostAdjustment, ",")
			
 
				 	jsonEncodeFloat64(buffer, "networkTransferBytes", a.NetworkTransferBytes, ",")
			
@@ -652,6 +681,12 @@ func (a *Allocation) add(that *Allocation) {
 
				 	ramUseByteMins := a.RAMBytesUsageAverage * a.Minutes()
			
 
				 	ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
			
 
				 
			
 
				+	gpuReqMins := a.GPURequestAverage * a.Minutes()
			
 
				+	gpuReqMins += that.GPURequestAverage * that.Minutes()
			
 
				+
			
 
				+	gpuUseMins := a.GPUUsageAverage * a.Minutes()
			
 
				+	gpuUseMins += that.GPUUsageAverage * that.Minutes()
			
 
				+
			
 
				 	// Expand Start and End to be the "max" of among the given Allocations
			
 
				 	if that.Start.Before(a.Start) {
			
 
				 		a.Start = that.Start
			
@@ -667,11 +702,15 @@ func (a *Allocation) add(that *Allocation) {
 
				 		a.CPUCoreUsageAverage = cpuUseCoreMins / a.Minutes()
			
 
				 		a.RAMBytesRequestAverage = ramReqByteMins / a.Minutes()
			
 
				 		a.RAMBytesUsageAverage = ramUseByteMins / a.Minutes()
			
 
				+		a.GPURequestAverage = gpuReqMins / a.Minutes()
			
 
				+		a.GPUUsageAverage = gpuUseMins / a.Minutes()
			
 
				 	} else {
			
 
				 		a.CPUCoreRequestAverage = 0.0
			
 
				 		a.CPUCoreUsageAverage = 0.0
			
 
				 		a.RAMBytesRequestAverage = 0.0
			
 
				 		a.RAMBytesUsageAverage = 0.0
			
 
				+		a.GPURequestAverage = 0.0
			
 
				+		a.GPUUsageAverage = 0.0
			
 
				 	}
			
 
				 
			
 
				 	// Sum all cumulative resource fields
			
--- a/pkg/kubecost/allocation_test.go
+++ b/pkg/kubecost/allocation_test.go
@@ -49,6 +49,8 @@ func TestAllocation_Add(t *testing.T) {
 
				 		CPUCoreRequestAverage: 2.0,
			
 
				 		CPUCoreUsageAverage:   1.0,
			
 
				 		CPUCost:               2.0 * hrs1 * cpuPrice,
			
 
				+		GPURequestAverage:     1.0,
			
 
				+		GPUUsageAverage:       0.70,
			
 
				 		CPUCostAdjustment:     3.0,
			
 
				 		GPUHours:              1.0 * hrs1,
			
 
				 		GPUCost:               1.0 * hrs1 * gpuPrice,
			
@@ -83,6 +85,8 @@ func TestAllocation_Add(t *testing.T) {
 
				 		CPUCoreUsageAverage:    1.0,
			
 
				 		CPUCost:                1.0 * hrs2 * cpuPrice,
			
 
				 		GPUHours:               0.0,
			
 
				+		GPURequestAverage:      1.0,
			
 
				+		GPUUsageAverage:        0.30,
			
 
				 		GPUCost:                0.0,
			
 
				 		RAMByteHours:           8.0 * gib * hrs2,
			
 
				 		RAMBytesRequestAverage: 0.0,
			
@@ -171,6 +175,8 @@ func TestAllocation_Add(t *testing.T) {
 
				 	// CPU usage = (1.0*12.0 + 1.0*18.0)/(24.0) = 1.25
			
 
				 	// RAM requests = (8.0*12.0 + 0.0*18.0)/(24.0) = 4.00
			
 
				 	// RAM usage = (4.0*12.0 + 8.0*18.0)/(24.0) = 8.00
			
 
				+	// GPU requests = (1.0*12.0 + 1.0*18.0)/(24.0) = 1.25
			
 
				+	// GPU usage = (0.7*12.0 + 0.3*18.0)/(24.0) = 0.575
			
 
				 	if !util.IsApproximately(1.75, act.CPUCoreRequestAverage) {
			
 
				 		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.75, act.CPUCoreRequestAverage)
			
 
				 	}
			
@@ -183,10 +189,17 @@ func TestAllocation_Add(t *testing.T) {
 
				 	if !util.IsApproximately(8.00*gib, act.RAMBytesUsageAverage) {
			
 
				 		t.Fatalf("Allocation.Add: expected %f; actual %f", 8.00*gib, act.RAMBytesUsageAverage)
			
 
				 	}
			
 
				+	if !util.IsApproximately(1.25, act.GPURequestAverage) {
			
 
				+		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.25, act.GPURequestAverage)
			
 
				+	}
			
 
				+	if !util.IsApproximately(0.575, act.GPUUsageAverage) {
			
 
				+		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.575, act.GPUUsageAverage)
			
 
				+	}
			
 
				 
			
 
				 	// Efficiency should be computed accurately from new request/usage
			
 
				 	// CPU efficiency = 1.25/1.75 = 0.7142857
			
 
				 	// RAM efficiency = 8.00/4.00 = 2.0000000
			
 
				+	// GPU efficiency = 0.575/1.25 = 0.46
			
 
				 	// Total efficiency = (0.7142857*0.72 + 2.0*1.92)/(2.64) = 1.6493506
			
 
				 	if !util.IsApproximately(0.7142857, act.CPUEfficiency()) {
			
 
				 		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.7142857, act.CPUEfficiency())
			
@@ -194,6 +207,9 @@ func TestAllocation_Add(t *testing.T) {
 
				 	if !util.IsApproximately(2.0000000, act.RAMEfficiency()) {
			
 
				 		t.Fatalf("Allocation.Add: expected %f; actual %f", 2.0000000, act.RAMEfficiency())
			
 
				 	}
			
 
				+	if !util.IsApproximately(0.46, act.GPUEfficiency()) {
			
 
				+		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.46, act.GPUEfficiency())
			
 
				+	}
			
 
				 	if !util.IsApproximately(1.279690, act.TotalEfficiency()) {
			
 
				 		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.279690, act.TotalEfficiency())
			
 
				 	}
			
@@ -223,6 +239,8 @@ func TestAllocation_Share(t *testing.T) {
 
				 		CPUCost:               2.0 * hrs1 * cpuPrice,
			
 
				 		CPUCostAdjustment:     3.0,
			
 
				 		GPUHours:              1.0 * hrs1,
			
 
				+		GPURequestAverage:     3.0,
			
 
				+		GPUUsageAverage:       0.20,
			
 
				 		GPUCost:               1.0 * hrs1 * gpuPrice,
			
 
				 		GPUCostAdjustment:     2.0,
			
 
				 		PVs: PVAllocations{
			
@@ -254,6 +272,8 @@ func TestAllocation_Share(t *testing.T) {
 
				 		CPUCoreUsageAverage:    1.0,
			
 
				 		CPUCost:                1.0 * hrs2 * cpuPrice,
			
 
				 		GPUHours:               0.0,
			
 
				+		GPURequestAverage:      0.0,
			
 
				+		GPUUsageAverage:        0.0,
			
 
				 		GPUCost:                0.0,
			
 
				 		RAMByteHours:           8.0 * gib * hrs2,
			
 
				 		RAMBytesRequestAverage: 0.0,
			
@@ -342,6 +362,12 @@ func TestAllocation_Share(t *testing.T) {
 
				 	if !util.IsApproximately(a1.RAMBytesUsageAverage, act.RAMBytesUsageAverage) {
			
 
				 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.RAMBytesUsageAverage, act.RAMBytesUsageAverage)
			
 
				 	}
			
 
				+	if !util.IsApproximately(a1.GPURequestAverage, act.GPURequestAverage) {
			
 
				+		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPURequestAverage, act.GPURequestAverage)
			
 
				+	}
			
 
				+	if !util.IsApproximately(a1.GPUUsageAverage, act.GPUUsageAverage) {
			
 
				+		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPUUsageAverage, act.GPUUsageAverage)
			
 
				+	}
			
 
				 
			
 
				 	// Efficiency should match before
			
 
				 	if !util.IsApproximately(a1.CPUEfficiency(), act.CPUEfficiency()) {
			
@@ -350,6 +376,9 @@ func TestAllocation_Share(t *testing.T) {
 
				 	if !util.IsApproximately(a1.RAMEfficiency(), act.RAMEfficiency()) {
			
 
				 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.RAMEfficiency(), act.RAMEfficiency())
			
 
				 	}
			
 
				+	if !util.IsApproximately(a1.GPUEfficiency(), act.GPUEfficiency()) {
			
 
				+		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPUEfficiency(), act.GPUEfficiency())
			
 
				+	}
			
 
				 	if !util.IsApproximately(a1.TotalEfficiency(), act.TotalEfficiency()) {
			
 
				 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.TotalEfficiency(), act.TotalEfficiency())
			
 
				 	}
			
@@ -409,6 +438,8 @@ func TestAllocation_MarshalJSON(t *testing.T) {
 
				 		CPUCost:               2.0 * hrs * cpuPrice,
			
 
				 		CPUCostAdjustment:     3.0,
			
 
				 		GPUHours:              1.0 * hrs,
			
 
				+		GPURequestAverage:     1.0,
			
 
				+		GPUUsageAverage:       0.70,
			
 
				 		GPUCost:               1.0 * hrs * gpuPrice,
			
 
				 		GPUCostAdjustment:     2.0,
			
 
				 		NetworkCost:           0.05,
			
--- a/pkg/kubecost/bingen.go
+++ b/pkg/kubecost/bingen.go
@@ -23,7 +23,7 @@ package kubecost
 
				 // @bingen:end
			
 
				 
			
 
				 // Allocation Version Set: Includes Allocation pipeline specific resources
			
 
				-// @bingen:set[name=Allocation,version=15]
			
 
				+// @bingen:set[name=Allocation,version=16]
			
 
				 // @bingen:generate:Allocation
			
 
				 // @bingen:generate[stringtable]:AllocationSet
			
 
				 // @bingen:generate:AllocationSetRange
			
@@ -37,4 +37,4 @@ package kubecost
 
				 // @bingen:generate:PVAllocation
			
 
				 // @bingen:end
			
 
				 
			
 
				-//go:generate bingen -package=kubecost -version=15 -buffer=github.com/kubecost/cost-model/pkg/util
			
 
				+//go:generate bingen -package=kubecost -version=16 -buffer=github.com/kubecost/cost-model/pkg/util