Просмотр исходного кода

Merge pull request #944 from kubecost/kaelan-gpuefficiency

Support Allocation GPU utilization/efficiency through integration with Nvidia GPU Operator/DCGM.
Kaelan Patel 4 года назад
Родитель
Коммит
eb8e55ec05
4 измененных файла: 107 добавлений и 2 удаления
  1. 35 0
      pkg/costmodel/allocation.go
  2. 39 0
      pkg/kubecost/allocation.go
  3. 31 0
      pkg/kubecost/allocation_test.go
  4. 2 2
      pkg/kubecost/bingen.go

+ 35 - 0
pkg/costmodel/allocation.go

@@ -30,6 +30,7 @@ const (
 	queryFmtCPUUsageMax              = `max(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s]%s)) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 	queryFmtGPUsRequested            = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, %s)`
 	queryFmtGPUsAllocated            = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, %s)`
+	queryFmtGPUUsageAvg              = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s]%s)) by (container, pod, namespace, %s)`
 	queryFmtNodeCostPerCPUHr         = `avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerRAMGiBHr      = `avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerGPUHr         = `avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (node, %s, instance_type, provider_id)`
@@ -158,6 +159,9 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, durStr, offStr, env.GetPromClusterLabel())
 	resChGPUsAllocated := ctx.Query(queryGPUsAllocated)
 
+	queryGPUUsageAvg := fmt.Sprintf(queryFmtGPUUsageAvg, durStr, offStr, env.GetPromClusterLabel())
+	resChGPUUsageAvg := ctx.Query(queryGPUUsageAvg)
+
 	queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, durStr, offStr, env.GetPromClusterLabel())
 	resChNodeCostPerCPUHr := ctx.Query(queryNodeCostPerCPUHr)
 
@@ -258,6 +262,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 	resRAMUsageMax, _ := resChRAMUsageMax.Await()
 	resGPUsRequested, _ := resChGPUsRequested.Await()
 	resGPUsAllocated, _ := resChGPUsAllocated.Await()
+	resGPUUsageAvg, _ := resChGPUUsageAvg.Await()
 
 	resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
 	resNodeCostPerRAMGiBHr, _ := resChNodeCostPerRAMGiBHr.Await()
@@ -314,6 +319,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time, resolution time.Dur
 	applyRAMBytesUsedAvg(podMap, resRAMUsageAvg)
 	applyRAMBytesUsedMax(podMap, resRAMUsageMax)
 	applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated)
+	applyGPUUsageAvg(podMap, resGPUUsageAvg)
 	applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes)
 	applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB)
 	applyNetworkAllocation(podMap, resNetRegionGiB, resNetRegionCostPerGiB)
@@ -954,6 +960,35 @@ func applyGPUsAllocated(podMap map[podKey]*Pod, resGPUsRequested []*prom.QueryRe
 
 		hrs := pod.Allocations[container].Minutes() / 60.0
 		pod.Allocations[container].GPUHours = res.Values[0].Value * hrs
+		pod.Allocations[container].GPURequestAverage = res.Values[0].Value
+	}
+}
+
+// applyGPUUsageAvg copies the average GPU utilization from each query result
+// onto the matching container allocation in podMap.
+//
+// Results are skipped (with a deduplicated warning where appropriate) when the
+// pod key cannot be derived from the result, when the pod is not present in
+// podMap, when the "container" label is missing, or when the result carries no
+// values. If the pod has no allocation for the container yet,
+// pod.AppendContainer is called before the value is written.
+func applyGPUUsageAvg(podMap map[podKey]*Pod, resGPUUsageAvg []*prom.QueryResult) {
+	for _, res := range resGPUUsageAvg {
+		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
+		if err != nil {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
+			continue
+		}
+
+		pod, ok := podMap[key]
+		if !ok {
+			continue
+		}
+
+		container, err := res.GetString("container")
+		if err != nil {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
+			continue
+		}
+
+		// A result without samples has nothing to apply; indexing Values[0]
+		// below would otherwise panic and abort the whole computation.
+		if len(res.Values) == 0 {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result has no values: %s", key)
+			continue
+		}
+
+		if _, ok := pod.Allocations[container]; !ok {
+			pod.AppendContainer(container)
+		}
+
+		// Metric represents percentages as full numbers, so scale down by factor of 100 to make consistent
+		// with other efficiency fields
+		pod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
 	}
 }
 

+ 39 - 0
pkg/kubecost/allocation.go

@@ -61,6 +61,8 @@ type Allocation struct {
 	CPUCost                    float64               `json:"cpuCost"`
 	CPUCostAdjustment          float64               `json:"cpuCostAdjustment"`
 	GPUHours                   float64               `json:"gpuHours"`
+	GPURequestAverage          float64               `json:"gpuRequestAverage"`
+	GPUUsageAverage            float64               `json:"gpuUsageAverage"`
 	GPUCost                    float64               `json:"gpuCost"`
 	GPUCostAdjustment          float64               `json:"gpuCostAdjustment"`
 	NetworkTransferBytes       float64               `json:"networkTransferBytes"`
@@ -205,6 +207,8 @@ func (a *Allocation) Clone() *Allocation {
 		CPUCost:                    a.CPUCost,
 		CPUCostAdjustment:          a.CPUCostAdjustment,
 		GPUHours:                   a.GPUHours,
+		GPURequestAverage:          a.GPURequestAverage,
+		GPUUsageAverage:            a.GPUUsageAverage,
 		GPUCost:                    a.GPUCost,
 		GPUCostAdjustment:          a.GPUCostAdjustment,
 		NetworkTransferBytes:       a.NetworkTransferBytes,
@@ -274,6 +278,12 @@ func (a *Allocation) Equal(that *Allocation) bool {
 	if !util.IsApproximately(a.GPUHours, that.GPUHours) {
 		return false
 	}
+	if !util.IsApproximately(a.GPURequestAverage, that.GPURequestAverage) {
+		return false
+	}
+	if !util.IsApproximately(a.GPUUsageAverage, that.GPUUsageAverage) {
+		return false
+	}
 	if !util.IsApproximately(a.GPUCost, that.GPUCost) {
 		return false
 	}
@@ -436,6 +446,23 @@ func (a *Allocation) RAMEfficiency() float64 {
 	return 1.0
 }
 
+// GPUEfficiency is the ratio of GPU usage to GPU request when both are
+// positive. Otherwise it is zero if either usage or cost is zero, and 100% if
+// there is both usage and cost but no request. Note that, without the Nvidia
+// dcgm providing Prometheus with usage metrics, this will always be zero, as
+// GPUUsageAverage will be zero (the default value).
+func (a *Allocation) GPUEfficiency() float64 {
+	if a.GPURequestAverage > 0 && a.GPUUsageAverage > 0 {
+		return a.GPUUsageAverage / a.GPURequestAverage
+	}
+
+	if a.GPUUsageAverage == 0.0 || a.GPUCost == 0.0 {
+		return 0.0
+	}
+
+	return 1.0
+}
+
 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
 // there is no cost at all, then efficiency is zero.
 func (a *Allocation) TotalEfficiency() float64 {
@@ -508,6 +535,8 @@ func (a *Allocation) MarshalJSON() ([]byte, error) {
 	jsonEncodeFloat64(buffer, "cpuEfficiency", a.CPUEfficiency(), ",")
 	jsonEncodeFloat64(buffer, "gpuCount", a.GPUs(), ",")
 	jsonEncodeFloat64(buffer, "gpuHours", a.GPUHours, ",")
+	jsonEncodeFloat64(buffer, "gpuRequestAverage", a.GPURequestAverage, ",")
+	jsonEncodeFloat64(buffer, "gpuUsageAverage", a.GPUUsageAverage, ",")
 	jsonEncodeFloat64(buffer, "gpuCost", a.GPUCost, ",")
 	jsonEncodeFloat64(buffer, "gpuCostAdjustment", a.GPUCostAdjustment, ",")
 	jsonEncodeFloat64(buffer, "networkTransferBytes", a.NetworkTransferBytes, ",")
@@ -652,6 +681,12 @@ func (a *Allocation) add(that *Allocation) {
 	ramUseByteMins := a.RAMBytesUsageAverage * a.Minutes()
 	ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
 
+	gpuReqMins := a.GPURequestAverage * a.Minutes()
+	gpuReqMins += that.GPURequestAverage * that.Minutes()
+
+	gpuUseMins := a.GPUUsageAverage * a.Minutes()
+	gpuUseMins += that.GPUUsageAverage * that.Minutes()
+
 	// Expand Start and End to be the "max" of among the given Allocations
 	if that.Start.Before(a.Start) {
 		a.Start = that.Start
@@ -667,11 +702,15 @@ func (a *Allocation) add(that *Allocation) {
 		a.CPUCoreUsageAverage = cpuUseCoreMins / a.Minutes()
 		a.RAMBytesRequestAverage = ramReqByteMins / a.Minutes()
 		a.RAMBytesUsageAverage = ramUseByteMins / a.Minutes()
+		a.GPURequestAverage = gpuReqMins / a.Minutes()
+		a.GPUUsageAverage = gpuUseMins / a.Minutes()
 	} else {
 		a.CPUCoreRequestAverage = 0.0
 		a.CPUCoreUsageAverage = 0.0
 		a.RAMBytesRequestAverage = 0.0
 		a.RAMBytesUsageAverage = 0.0
+		a.GPURequestAverage = 0.0
+		a.GPUUsageAverage = 0.0
 	}
 
 	// Sum all cumulative resource fields

+ 31 - 0
pkg/kubecost/allocation_test.go

@@ -49,6 +49,8 @@ func TestAllocation_Add(t *testing.T) {
 		CPUCoreRequestAverage: 2.0,
 		CPUCoreUsageAverage:   1.0,
 		CPUCost:               2.0 * hrs1 * cpuPrice,
+		GPURequestAverage:     1.0,
+		GPUUsageAverage:       0.70,
 		CPUCostAdjustment:     3.0,
 		GPUHours:              1.0 * hrs1,
 		GPUCost:               1.0 * hrs1 * gpuPrice,
@@ -83,6 +85,8 @@ func TestAllocation_Add(t *testing.T) {
 		CPUCoreUsageAverage:    1.0,
 		CPUCost:                1.0 * hrs2 * cpuPrice,
 		GPUHours:               0.0,
+		GPURequestAverage:      1.0,
+		GPUUsageAverage:        0.30,
 		GPUCost:                0.0,
 		RAMByteHours:           8.0 * gib * hrs2,
 		RAMBytesRequestAverage: 0.0,
@@ -171,6 +175,8 @@ func TestAllocation_Add(t *testing.T) {
 	// CPU usage = (1.0*12.0 + 1.0*18.0)/(24.0) = 1.25
 	// RAM requests = (8.0*12.0 + 0.0*18.0)/(24.0) = 4.00
 	// RAM usage = (4.0*12.0 + 8.0*18.0)/(24.0) = 8.00
+	// GPU requests = (1.0*12.0 + 1.0*18.0)/(24.0) = 1.25
+	// GPU usage = (0.7*12.0 + 0.3*18.0)/(24.0) = 0.575
 	if !util.IsApproximately(1.75, act.CPUCoreRequestAverage) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.75, act.CPUCoreRequestAverage)
 	}
@@ -183,10 +189,17 @@ func TestAllocation_Add(t *testing.T) {
 	if !util.IsApproximately(8.00*gib, act.RAMBytesUsageAverage) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 8.00*gib, act.RAMBytesUsageAverage)
 	}
+	if !util.IsApproximately(1.25, act.GPURequestAverage) {
+		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.25, act.GPURequestAverage)
+	}
+	if !util.IsApproximately(0.575, act.GPUUsageAverage) {
+		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.575, act.GPUUsageAverage)
+	}
 
 	// Efficiency should be computed accurately from new request/usage
 	// CPU efficiency = 1.25/1.75 = 0.7142857
 	// RAM efficiency = 8.00/4.00 = 2.0000000
+	// GPU efficiency = 0.575/1.25 = 0.46
 	// Total efficiency = (0.7142857*0.72 + 2.0*1.92)/(2.64) = 1.6493506
 	if !util.IsApproximately(0.7142857, act.CPUEfficiency()) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.7142857, act.CPUEfficiency())
@@ -194,6 +207,9 @@ func TestAllocation_Add(t *testing.T) {
 	if !util.IsApproximately(2.0000000, act.RAMEfficiency()) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 2.0000000, act.RAMEfficiency())
 	}
+	if !util.IsApproximately(0.46, act.GPUEfficiency()) {
+		t.Fatalf("Allocation.Add: expected %f; actual %f", 0.46, act.GPUEfficiency())
+	}
 	if !util.IsApproximately(1.279690, act.TotalEfficiency()) {
 		t.Fatalf("Allocation.Add: expected %f; actual %f", 1.279690, act.TotalEfficiency())
 	}
@@ -223,6 +239,8 @@ func TestAllocation_Share(t *testing.T) {
 		CPUCost:               2.0 * hrs1 * cpuPrice,
 		CPUCostAdjustment:     3.0,
 		GPUHours:              1.0 * hrs1,
+		GPURequestAverage:     3.0,
+		GPUUsageAverage:       0.20,
 		GPUCost:               1.0 * hrs1 * gpuPrice,
 		GPUCostAdjustment:     2.0,
 		PVs: PVAllocations{
@@ -254,6 +272,8 @@ func TestAllocation_Share(t *testing.T) {
 		CPUCoreUsageAverage:    1.0,
 		CPUCost:                1.0 * hrs2 * cpuPrice,
 		GPUHours:               0.0,
+		GPURequestAverage:      0.0,
+		GPUUsageAverage:        0.0,
 		GPUCost:                0.0,
 		RAMByteHours:           8.0 * gib * hrs2,
 		RAMBytesRequestAverage: 0.0,
@@ -342,6 +362,12 @@ func TestAllocation_Share(t *testing.T) {
 	if !util.IsApproximately(a1.RAMBytesUsageAverage, act.RAMBytesUsageAverage) {
 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.RAMBytesUsageAverage, act.RAMBytesUsageAverage)
 	}
+	if !util.IsApproximately(a1.GPURequestAverage, act.GPURequestAverage) {
+		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPURequestAverage, act.GPURequestAverage)
+	}
+	if !util.IsApproximately(a1.GPUUsageAverage, act.GPUUsageAverage) {
+		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPUUsageAverage, act.GPUUsageAverage)
+	}
 
 	// Efficiency should match before
 	if !util.IsApproximately(a1.CPUEfficiency(), act.CPUEfficiency()) {
@@ -350,6 +376,9 @@ func TestAllocation_Share(t *testing.T) {
 	if !util.IsApproximately(a1.RAMEfficiency(), act.RAMEfficiency()) {
 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.RAMEfficiency(), act.RAMEfficiency())
 	}
+	if !util.IsApproximately(a1.GPUEfficiency(), act.GPUEfficiency()) {
+		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.GPUEfficiency(), act.GPUEfficiency())
+	}
 	if !util.IsApproximately(a1.TotalEfficiency(), act.TotalEfficiency()) {
 		t.Fatalf("Allocation.Share: expected %f; actual %f", a1.TotalEfficiency(), act.TotalEfficiency())
 	}
@@ -409,6 +438,8 @@ func TestAllocation_MarshalJSON(t *testing.T) {
 		CPUCost:               2.0 * hrs * cpuPrice,
 		CPUCostAdjustment:     3.0,
 		GPUHours:              1.0 * hrs,
+		GPURequestAverage:     1.0,
+		GPUUsageAverage:       0.70,
 		GPUCost:               1.0 * hrs * gpuPrice,
 		GPUCostAdjustment:     2.0,
 		NetworkCost:           0.05,

+ 2 - 2
pkg/kubecost/bingen.go

@@ -23,7 +23,7 @@ package kubecost
 // @bingen:end
 
 // Allocation Version Set: Includes Allocation pipeline specific resources
-// @bingen:set[name=Allocation,version=15]
+// @bingen:set[name=Allocation,version=16]
 // @bingen:generate:Allocation
 // @bingen:generate[stringtable]:AllocationSet
 // @bingen:generate:AllocationSetRange
@@ -37,4 +37,4 @@ package kubecost
 // @bingen:generate:PVAllocation
 // @bingen:end
 
-//go:generate bingen -package=kubecost -version=15 -buffer=github.com/kubecost/cost-model/pkg/util
+//go:generate bingen -package=kubecost -version=16 -buffer=github.com/kubecost/cost-model/pkg/util