Explorar el Código

Merge pull request #965 from kubecost/revert-944-kaelan-gpuefficiency

Revert "Support Allocation GPU utilization/efficiency through integration with Nvidia GPU Operator/DCGM."
Sean Holcomb hace 4 años
padre
commit
6f9cccae7a

+ 0 - 35
pkg/costmodel/allocation.go

@@ -30,7 +30,6 @@ const (
 	queryFmtCPUUsageMax              = `max(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s]%s)) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 	queryFmtGPUsRequested            = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, %s)`
 	queryFmtGPUsAllocated            = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, %s)`
-	queryFmtGPUUsageAvg              = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s]%s)) by (container, pod, namespace, %s)`
 	queryFmtNodeCostPerCPUHr         = `avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerRAMGiBHr      = `avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerGPUHr         = `avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
@@ -159,9 +158,6 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, durStr, offStr, env.GetPromClusterLabel())
 	resChGPUsAllocated := ctx.Query(queryGPUsAllocated)
 
-	queryGPUUsageAvg := fmt.Sprintf(queryFmtGPUUsageAvg, durStr, offStr, env.GetPromClusterLabel())
-	resChGPUUsageAvg := ctx.Query(queryGPUUsageAvg)
-
 	queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, durStr, offStr, env.GetPromClusterLabel())
 	resChNodeCostPerCPUHr := ctx.Query(queryNodeCostPerCPUHr)
 
@@ -262,7 +258,6 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 	resRAMUsageMax, _ := resChRAMUsageMax.Await()
 	resGPUsRequested, _ := resChGPUsRequested.Await()
 	resGPUsAllocated, _ := resChGPUsAllocated.Await()
-	resGPUUsageAvg, _ := resChGPUUsageAvg.Await()
 
 	resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
 	resNodeCostPerRAMGiBHr, _ := resChNodeCostPerRAMGiBHr.Await()
@@ -319,7 +314,6 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 	applyRAMBytesUsedAvg(podMap, resRAMUsageAvg)
 	applyRAMBytesUsedMax(podMap, resRAMUsageMax)
 	applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated)
-	applyGPUUsageAvg(podMap, resGPUUsageAvg)
 	applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes)
 	applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB)
 	applyNetworkAllocation(podMap, resNetRegionGiB, resNetRegionCostPerGiB)
@@ -960,35 +954,6 @@ func applyGPUsAllocated(podMap map[podKey]*Pod, resGPUsRequested []*prom.QueryRe
 
 		hrs := pod.Allocations[container].Minutes() / 60.0
 		pod.Allocations[container].GPUHours = res.Values[0].Value * hrs
-		pod.Allocations[container].GPURequestAverage = res.Values[0].Value
-	}
-}
-
-func applyGPUUsageAvg(podMap map[podKey]*Pod, resGPUUsageAvg []*prom.QueryResult) {
-	for _, res := range resGPUUsageAvg {
-		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
-		if err != nil {
-			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
-			continue
-		}
-
-		pod, ok := podMap[key]
-		if !ok {
-			continue
-		}
-		container, err := res.GetString("container")
-		if err != nil {
-			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
-			continue
-		}
-
-		if _, ok := pod.Allocations[container]; !ok {
-			pod.AppendContainer(container)
-		}
-
-		// Metric represents percentages as full numbers, so scale down by factor of 100 to make consistent
-		// with other efficiency fields
-		pod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
 	}
 }
 

+ 0 - 39
pkg/kubecost/allocation.go

@@ -61,8 +61,6 @@ type Allocation struct {
 	CPUCost                    float64               `json:"cpuCost"`
 	CPUCostAdjustment          float64               `json:"cpuCostAdjustment"`
 	GPUHours                   float64               `json:"gpuHours"`
-	GPURequestAverage          float64               `json:"gpuRequestAverage"`
-	GPUUsageAverage            float64               `json:"gpuUsageAverage"`
 	GPUCost                    float64               `json:"gpuCost"`
 	GPUCostAdjustment          float64               `json:"gpuCostAdjustment"`
 	NetworkTransferBytes       float64               `json:"networkTransferBytes"`
@@ -207,8 +205,6 @@ func (a *Allocation) Clone() *Allocation {
 		CPUCost:                    a.CPUCost,
 		CPUCostAdjustment:          a.CPUCostAdjustment,
 		GPUHours:                   a.GPUHours,
-		GPURequestAverage:          a.GPURequestAverage,
-		GPUUsageAverage:            a.GPUUsageAverage,
 		GPUCost:                    a.GPUCost,
 		GPUCostAdjustment:          a.GPUCostAdjustment,
 		NetworkTransferBytes:       a.NetworkTransferBytes,
@@ -278,12 +274,6 @@ func (a *Allocation) Equal(that *Allocation) bool {
 	if !util.IsApproximately(a.GPUHours, that.GPUHours) {
 		return false
 	}
-	if !util.IsApproximately(a.GPURequestAverage, that.GPURequestAverage) {
-		return false
-	}
-	if !util.IsApproximately(a.GPUUsageAverage, that.GPUUsageAverage) {
-		return false
-	}
 	if !util.IsApproximately(a.GPUCost, that.GPUCost) {
 		return false
 	}
@@ -446,23 +436,6 @@ func (a *Allocation) RAMEfficiency() float64 {
 	return 1.0
 }
 
-// GPUEfficiency is the ratio of usage to request. If there is no request and
-// no usage or cost, then efficiency is zero. If there is no request, but there
-// is usage or cost, then efficiency is 100%. Note that, without the Nvidia dcgm
-// providing Prometheus with usage metrics, this will always be zero, as
-// GPUUsageAverage will be zero (the default value).
-func (a *Allocation) GPUEfficiency() float64 {
-	if a.GPURequestAverage > 0 && a.GPUUsageAverage > 0 {
-		return a.GPUUsageAverage / a.GPURequestAverage
-	}
-
-	if a.GPUUsageAverage == 0.0 || a.GPUCost == 0.0 {
-		return 0.0
-	}
-
-	return 1.0
-}
-
 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
 // there is no cost at all, then efficiency is zero.
 func (a *Allocation) TotalEfficiency() float64 {
@@ -535,8 +508,6 @@ func (a *Allocation) MarshalJSON() ([]byte, error) {
 	jsonEncodeFloat64(buffer, "cpuEfficiency", a.CPUEfficiency(), ",")
 	jsonEncodeFloat64(buffer, "gpuCount", a.GPUs(), ",")
 	jsonEncodeFloat64(buffer, "gpuHours", a.GPUHours, ",")
-	jsonEncodeFloat64(buffer, "gpuRequestAverage", a.GPURequestAverage, ",")
-	jsonEncodeFloat64(buffer, "gpuUsageAverage", a.GPUUsageAverage, ",")
 	jsonEncodeFloat64(buffer, "gpuCost", a.GPUCost, ",")
 	jsonEncodeFloat64(buffer, "gpuCostAdjustment", a.GPUCostAdjustment, ",")
 	jsonEncodeFloat64(buffer, "networkTransferBytes", a.NetworkTransferBytes, ",")
@@ -681,12 +652,6 @@ func (a *Allocation) add(that *Allocation) {
 	ramUseByteMins := a.RAMBytesUsageAverage * a.Minutes()
 	ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
 
-	gpuReqMins := a.GPURequestAverage * a.Minutes()
-	gpuReqMins += that.GPURequestAverage * that.Minutes()
-
-	gpuUseMins := a.GPUUsageAverage * a.Minutes()
-	gpuUseMins += that.GPUUsageAverage * that.Minutes()
-
 	// Expand Start and End to be the "max" of among the given Allocations
 	if that.Start.Before(a.Start) {
 		a.Start = that.Start
@@ -702,15 +667,11 @@ func (a *Allocation) add(that *Allocation) {
 		a.CPUCoreUsageAverage = cpuUseCoreMins / a.Minutes()
 		a.RAMBytesRequestAverage = ramReqByteMins / a.Minutes()
 		a.RAMBytesUsageAverage = ramUseByteMins / a.Minutes()
-		a.GPURequestAverage = gpuReqMins / a.Minutes()
-		a.GPUUsageAverage = gpuUseMins / a.Minutes()
 	} else {
 		a.CPUCoreRequestAverage = 0.0
 		a.CPUCoreUsageAverage = 0.0
 		a.RAMBytesRequestAverage = 0.0
 		a.RAMBytesUsageAverage = 0.0
-		a.GPURequestAverage = 0.0
-		a.GPUUsageAverage = 0.0
 	}
 
 	// Sum all cumulative resource fields

+ 0 - 31
pkg/kubecost/allocation_test.go

@@ -49,8 +49,6 @@ func TestAllocation_Add(t *testing.T) {
 		CPUCoreRequestAverage: 2.0,
 		CPUCoreUsageAverage:   1.0,
 		CPUCost:               2.0 * hrs1 * cpuPrice,
-		GPURequestAverage:     1.0,
-		GPUUsageAverage:       0.70,
 		CPUCostAdjustment:     3.0,
 		GPUHours:              1.0 * hrs1,
 		GPUCost:               1.0 * hrs1 * gpuPrice,
@@ -85,8 +83,6 @@ func TestAllocation_Add(t *testing.T) {
 		CPUCoreUsageAverage:    1.0,
 		CPUCost:                1.0 * hrs2 * cpuPrice,
 		GPUHours:               0.0,
-		GPURequestAverage:      1.0,
-		GPUUsageAverage:        0.30,
 		GPUCost:                0.0,
 		RAMByteHours:           8.0 * gib * hrs2,
 		RAMBytesRequestAverage: 0.0,
@@ -175,8 +171,6 @@ func TestAllocation_Add(t *testing.T) {
 	// CPU usage = (1.0*12.0 + 1.0*18.0)/(24.0) = 1.25
 	// RAM requests = (8.0*12.0 + 0.0*18.0)/(24.0) = 4.00
 	// RAM usage = (4.0*12.0 + 8.0*18.0)/(24.0) = 8.00
-	// GPU requests = (1.0*12.0 + 1.0*18.0)/(24.0) = 1.25
-	// GPU usage = (0.7*12.0 + 0.3*18.0)/(24.0) = 0.575
 	if !util.IsApproximately(1.75, act.CPUCoreRequestAverage) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.75, act.CPUCoreRequestAverage)
 	}
@@ -189,17 +183,10 @@ func TestAllocation_Add(t *testing.T) {
 	if !util.IsApproximately(8.00*gib, act.RAMBytesUsageAverage) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 8.00*gib, act.RAMBytesUsageAverage)
 	}
-	if !util.IsApproximately(1.25, act.GPURequestAverage) {
-		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.25, act.GPURequestAverage)
-	}
-	if !util.IsApproximately(0.575, act.GPUUsageAverage) {
-		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.575, act.GPUUsageAverage)
-	}
 
 	// Efficiency should be computed accurately from new request/usage
 	// CPU efficiency = 1.25/1.75 = 0.7142857
 	// RAM efficiency = 8.00/4.00 = 2.0000000
-	// GPU efficiency = 0.575/1.25 = 0.46
 	// Total efficiency = (0.7142857*0.72 + 2.0*1.92)/(2.64) = 1.6493506
 	if !util.IsApproximately(0.7142857, act.CPUEfficiency()) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.7142857, act.CPUEfficiency())
@@ -207,9 +194,6 @@ func TestAllocation_Add(t *testing.T) {
 	if !util.IsApproximately(2.0000000, act.RAMEfficiency()) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 2.0000000, act.RAMEfficiency())
 	}
-	if !util.IsApproximately(0.46, act.GPUEfficiency()) {
-		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.46, act.GPUEfficiency())
-	}
 	if !util.IsApproximately(1.279690, act.TotalEfficiency()) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.279690, act.TotalEfficiency())
 	}
@@ -239,8 +223,6 @@ func TestAllocation_Share(t *testing.T) {
 		CPUCost:               2.0 * hrs1 * cpuPrice,
 		CPUCostAdjustment:     3.0,
 		GPUHours:              1.0 * hrs1,
-		GPURequestAverage:     3.0,
-		GPUUsageAverage:       0.20,
 		GPUCost:               1.0 * hrs1 * gpuPrice,
 		GPUCostAdjustment:     2.0,
 		PVs: PVAllocations{
@@ -272,8 +254,6 @@ func TestAllocation_Share(t *testing.T) {
 		CPUCoreUsageAverage:    1.0,
 		CPUCost:                1.0 * hrs2 * cpuPrice,
 		GPUHours:               0.0,
-		GPURequestAverage:      0.0,
-		GPUUsageAverage:        0.0,
 		GPUCost:                0.0,
 		RAMByteHours:           8.0 * gib * hrs2,
 		RAMBytesRequestAverage: 0.0,
@@ -362,12 +342,6 @@ func TestAllocation_Share(t *testing.T) {
 	if !util.IsApproximately(a1.RAMBytesUsageAverage, act.RAMBytesUsageAverage) {
 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.RAMBytesUsageAverage, act.RAMBytesUsageAverage)
 	}
-	if !util.IsApproximately(a1.GPURequestAverage, act.GPURequestAverage) {
-		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPURequestAverage, act.GPURequestAverage)
-	}
-	if !util.IsApproximately(a1.GPUUsageAverage, act.GPUUsageAverage) {
-		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPUUsageAverage, act.GPUUsageAverage)
-	}
 
 	// Efficiency should match before
 	if !util.IsApproximately(a1.CPUEfficiency(), act.CPUEfficiency()) {
@@ -376,9 +350,6 @@ func TestAllocation_Share(t *testing.T) {
 	if !util.IsApproximately(a1.RAMEfficiency(), act.RAMEfficiency()) {
 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.RAMEfficiency(), act.RAMEfficiency())
 	}
-	if !util.IsApproximately(a1.GPUEfficiency(), act.GPUEfficiency()) {
-		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPUEfficiency(), act.GPUEfficiency())
-	}
 	if !util.IsApproximately(a1.TotalEfficiency(), act.TotalEfficiency()) {
 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.TotalEfficiency(), act.TotalEfficiency())
 	}
@@ -438,8 +409,6 @@ func TestAllocation_MarshalJSON(t *testing.T) {
 		CPUCost:               2.0 * hrs * cpuPrice,
 		CPUCostAdjustment:     3.0,
 		GPUHours:              1.0 * hrs,
-		GPURequestAverage:     1.0,
-		GPUUsageAverage:       0.70,
 		GPUCost:               1.0 * hrs * gpuPrice,
 		GPUCostAdjustment:     2.0,
 		NetworkCost:           0.05,

+ 2 - 2
pkg/kubecost/bingen.go

@@ -23,7 +23,7 @@ package kubecost
 // @bingen:end
 
 // Allocation Version Set: Includes Allocation pipeline specific resources
-// @bingen:set[name=Allocation,version=16]
+// @bingen:set[name=Allocation,version=15]
 // @bingen:generate:Allocation
 // @bingen:generate[stringtable]:AllocationSet
 // @bingen:generate:AllocationSetRange
@@ -37,4 +37,4 @@ package kubecost
 // @bingen:generate:PVAllocation
 // @bingen:end
 
-//go:generate bingen -package=kubecost -version=16 -buffer=github.com/kubecost/cost-model/pkg/util
+//go:generate bingen -package=kubecost -version=15 -buffer=github.com/kubecost/cost-model/pkg/util