Ver código fonte

Merge pull request #2731 from thomasvn/thomasn/gpu-efficiency

Reintroduce GPU Usage & Efficiency
Autor: Ajay Tripathy, 1 ano atrás
commit pai: fb03ac5219

+ 42 - 1
core/pkg/opencost/allocation.go

@@ -97,7 +97,9 @@ type Allocation struct {
 	// UnmountedPVCost is used to track how much of the cost in PVs is for an
 	// unmounted PV. It is not additive of PVCost() and need not be sent in API
 	// responses.
-	UnmountedPVCost float64 `json:"-"` //@bingen:field[ignore]
+	UnmountedPVCost   float64 `json:"-"`                 //@bingen:field[ignore]
+	GPURequestAverage float64 `json:"gpuRequestAverage"` //@bingen:field[version=22]
+	GPUUsageAverage   float64 `json:"gpuUsageAverage"`   //@bingen:field[version=22]
 }
 
 type LbAllocations map[string]*LbAllocation
@@ -669,6 +671,8 @@ func (a *Allocation) Clone() *Allocation {
 		CPUCost:                        a.CPUCost,
 		CPUCostAdjustment:              a.CPUCostAdjustment,
 		GPUHours:                       a.GPUHours,
+		GPURequestAverage:              a.GPURequestAverage,
+		GPUUsageAverage:                a.GPUUsageAverage,
 		GPUCost:                        a.GPUCost,
 		GPUCostAdjustment:              a.GPUCostAdjustment,
 		NetworkTransferBytes:           a.NetworkTransferBytes,
@@ -942,6 +946,25 @@ func (a *Allocation) RAMEfficiency() float64 {
 	return 1.0
 }
 
+// GPUEfficiency is the ratio of usage to request. Note that, without the NVIDIA
+// DCGM exporter providing Prometheus with usage metrics, this will always be
+// zero, as GPUUsageAverage will be zero (the default value).
+//
+// The fall-through shape appears to mirror RAMEfficiency (above): nil receiver
+// or missing usage/cost yields 0.0; usage with no recorded request but nonzero
+// cost is treated as fully efficient (1.0). When usage exceeds request, the
+// returned ratio exceeds 1.0 — it is not clamped.
+func (a *Allocation) GPUEfficiency() float64 {
+	// Nil-safe so callers can invoke this without a receiver check.
+	if a == nil {
+		return 0.0
+	}
+
+	// Normal case: both a request and observed usage are present.
+	if a.GPURequestAverage > 0 && a.GPUUsageAverage > 0 {
+		return a.GPUUsageAverage / a.GPURequestAverage
+	}
+
+	// No observed usage, or no GPU cost at all: report zero efficiency.
+	if a.GPUUsageAverage == 0.0 || a.GPUCost == 0.0 {
+		return 0.0
+	}
+
+	// Remaining case: usage > 0, request == 0, cost > 0 — report 100%.
+	return 1.0
+}
+
 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
 // there is no cost at all, then efficiency is zero.
 func (a *Allocation) TotalEfficiency() float64 {
@@ -1183,6 +1206,12 @@ func (a *Allocation) add(that *Allocation) {
 	ramUseByteMins := a.RAMBytesUsageAverage * a.Minutes()
 	ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
 
+	gpuReqMins := a.GPURequestAverage * a.Minutes()
+	gpuReqMins += that.GPURequestAverage * that.Minutes()
+
+	gpuUseMins := a.GPUUsageAverage * a.Minutes()
+	gpuUseMins += that.GPUUsageAverage * that.Minutes()
+
 	// Expand Start and End to be the "max" of among the given Allocations
 	if that.Start.Before(a.Start) {
 		a.Start = that.Start
@@ -1198,11 +1227,15 @@ func (a *Allocation) add(that *Allocation) {
 		a.CPUCoreUsageAverage = cpuUseCoreMins / a.Minutes()
 		a.RAMBytesRequestAverage = ramReqByteMins / a.Minutes()
 		a.RAMBytesUsageAverage = ramUseByteMins / a.Minutes()
+		a.GPURequestAverage = gpuReqMins / a.Minutes()
+		a.GPUUsageAverage = gpuUseMins / a.Minutes()
 	} else {
 		a.CPUCoreRequestAverage = 0.0
 		a.CPUCoreUsageAverage = 0.0
 		a.RAMBytesRequestAverage = 0.0
 		a.RAMBytesUsageAverage = 0.0
+		a.GPURequestAverage = 0.0
+		a.GPUUsageAverage = 0.0
 	}
 
 	// Sum all cumulative resource fields
@@ -2532,6 +2565,14 @@ func (a *Allocation) SanitizeNaN() {
 		log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPUHours name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
 		a.GPUHours = 0
 	}
+	if math.IsNaN(a.GPURequestAverage) {
+		log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPURequestAverage name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
+		a.GPURequestAverage = 0
+	}
+	if math.IsNaN(a.GPUUsageAverage) {
+		log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPUUsageAverage name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
+		a.GPUUsageAverage = 0
+	}
 	if math.IsNaN(a.GPUCost) {
 		log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPUCost name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
 		a.GPUCost = 0

+ 6 - 0
core/pkg/opencost/allocation_json.go

@@ -25,9 +25,12 @@ type AllocationJSON struct {
 	CPUCostAdjustment              *float64                        `json:"cpuCostAdjustment"`
 	CPUEfficiency                  *float64                        `json:"cpuEfficiency"`
 	GPUCount                       *float64                        `json:"gpuCount"`
+	GPURequestAverage              *float64                        `json:"gpuRequestAverage"`
+	GPUUsageAverage                *float64                        `json:"gpuUsageAverage"`
 	GPUHours                       *float64                        `json:"gpuHours"`
 	GPUCost                        *float64                        `json:"gpuCost"`
 	GPUCostAdjustment              *float64                        `json:"gpuCostAdjustment"`
+	GPUEfficiency                  *float64                        `json:"gpuEfficiency"`
 	NetworkTransferBytes           *float64                        `json:"networkTransferBytes"`
 	NetworkReceiveBytes            *float64                        `json:"networkReceiveBytes"`
 	NetworkCost                    *float64                        `json:"networkCost"`
@@ -77,9 +80,12 @@ func (aj *AllocationJSON) BuildFromAllocation(a *Allocation) {
 	aj.CPUCostAdjustment = formatFloat64ForResponse(a.CPUCostAdjustment)
 	aj.CPUEfficiency = formatFloat64ForResponse(a.CPUEfficiency())
 	aj.GPUCount = formatFloat64ForResponse(a.GPUs())
+	aj.GPURequestAverage = formatFloat64ForResponse(a.GPURequestAverage)
+	aj.GPUUsageAverage = formatFloat64ForResponse(a.GPUUsageAverage)
 	aj.GPUHours = formatFloat64ForResponse(a.GPUHours)
 	aj.GPUCost = formatFloat64ForResponse(a.GPUCost)
 	aj.GPUCostAdjustment = formatFloat64ForResponse(a.GPUCostAdjustment)
+	aj.GPUEfficiency = formatFloat64ForResponse(a.GPUEfficiency())
 	aj.NetworkTransferBytes = formatFloat64ForResponse(a.NetworkTransferBytes)
 	aj.NetworkReceiveBytes = formatFloat64ForResponse(a.NetworkReceiveBytes)
 	aj.NetworkCost = formatFloat64ForResponse(a.NetworkCost)

+ 1 - 1
core/pkg/opencost/bingen.go

@@ -46,7 +46,7 @@ package opencost
 // @bingen:end
 
 // Allocation Version Set: Includes Allocation pipeline specific resources
-// @bingen:set[name=Allocation,version=21]
+// @bingen:set[name=Allocation,version=22]
 // @bingen:generate:Allocation
 // @bingen:generate[stringtable]:AllocationSet
 // @bingen:generate:AllocationSetRange

+ 21 - 1
core/pkg/opencost/opencost_codecs.go

@@ -41,7 +41,7 @@ const (
 	AssetsCodecVersion uint8 = 21
 
 	// AllocationCodecVersion is used for any resources listed in the Allocation version set
-	AllocationCodecVersion uint8 = 21
+	AllocationCodecVersion uint8 = 22
 
 	// CloudCostCodecVersion is used for any resources listed in the CloudCost version set
 	CloudCostCodecVersion uint8 = 2
@@ -457,6 +457,8 @@ func (target *Allocation) MarshalBinaryWithContext(ctx *EncodingContext) (err er
 	}
 	// --- [end][write][alias](LbAllocations) ---
 
+	buff.WriteFloat64(target.GPURequestAverage) // write float64
+	buff.WriteFloat64(target.GPUUsageAverage)   // write float64
 	return nil
 }
 
@@ -769,6 +771,24 @@ func (target *Allocation) UnmarshalBinaryWithContext(ctx *DecodingContext) (err
 	} else {
 	}
 
+	// field version check
+	if uint8(22) <= version {
+		fff := buff.ReadFloat64() // read float64
+		target.GPURequestAverage = fff
+
+	} else {
+		target.GPURequestAverage = float64(0) // default
+	}
+
+	// field version check
+	if uint8(22) <= version {
+		ggg := buff.ReadFloat64() // read float64
+		target.GPUUsageAverage = ggg
+
+	} else {
+		target.GPUUsageAverage = float64(0) // default
+	}
+
 	return nil
 }
 

+ 24 - 0
core/pkg/opencost/summaryallocation.go

@@ -29,6 +29,8 @@ type SummaryAllocation struct {
 	CPUCoreRequestAverage  float64               `json:"cpuCoreRequestAverage"`
 	CPUCoreUsageAverage    float64               `json:"cpuCoreUsageAverage"`
 	CPUCost                float64               `json:"cpuCost"`
+	GPURequestAverage      float64               `json:"gpuRequestAverage"`
+	GPUUsageAverage        float64               `json:"gpuUsageAverage"`
 	GPUCost                float64               `json:"gpuCost"`
 	NetworkCost            float64               `json:"networkCost"`
 	LoadBalancerCost       float64               `json:"loadBalancerCost"`
@@ -60,6 +62,8 @@ func NewSummaryAllocation(alloc *Allocation, reconcile, reconcileNetwork bool) *
 		CPUCoreRequestAverage:  alloc.CPUCoreRequestAverage,
 		CPUCoreUsageAverage:    alloc.CPUCoreUsageAverage,
 		CPUCost:                alloc.CPUCost + alloc.CPUCostAdjustment,
+		GPURequestAverage:      alloc.GPURequestAverage,
+		GPUUsageAverage:        alloc.GPUUsageAverage,
 		GPUCost:                alloc.GPUCost + alloc.GPUCostAdjustment,
 		NetworkCost:            alloc.NetworkCost + alloc.NetworkCostAdjustment,
 		LoadBalancerCost:       alloc.LoadBalancerCost + alloc.LoadBalancerCostAdjustment,
@@ -121,6 +125,12 @@ func (sa *SummaryAllocation) Add(that *SummaryAllocation) error {
 	ramUseByteMins := sa.RAMBytesUsageAverage * sa.Minutes()
 	ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
 
+	gpuReqMins := sa.GPURequestAverage * sa.Minutes()
+	gpuReqMins += that.GPURequestAverage * that.Minutes()
+
+	gpuUseMins := sa.GPUUsageAverage * sa.Minutes()
+	gpuUseMins += that.GPUUsageAverage * that.Minutes()
+
 	// Expand Start and End to be the "max" of among the given Allocations
 	if that.Start.Before(sa.Start) {
 		sa.Start = that.Start
@@ -135,11 +145,15 @@ func (sa *SummaryAllocation) Add(that *SummaryAllocation) error {
 		sa.CPUCoreUsageAverage = cpuUseCoreMins / sa.Minutes()
 		sa.RAMBytesRequestAverage = ramReqByteMins / sa.Minutes()
 		sa.RAMBytesUsageAverage = ramUseByteMins / sa.Minutes()
+		sa.GPURequestAverage = gpuReqMins / sa.Minutes()
+		sa.GPUUsageAverage = gpuUseMins / sa.Minutes()
 	} else {
 		sa.CPUCoreRequestAverage = 0.0
 		sa.CPUCoreUsageAverage = 0.0
 		sa.RAMBytesRequestAverage = 0.0
 		sa.RAMBytesUsageAverage = 0.0
+		sa.GPURequestAverage = 0.0
+		sa.GPUUsageAverage = 0.0
 	}
 
 	// Sum all cumulative cost fields
@@ -166,6 +180,8 @@ func (sa *SummaryAllocation) Clone() *SummaryAllocation {
 		CPUCoreRequestAverage:  sa.CPUCoreRequestAverage,
 		CPUCoreUsageAverage:    sa.CPUCoreUsageAverage,
 		CPUCost:                sa.CPUCost,
+		GPURequestAverage:      sa.GPURequestAverage,
+		GPUUsageAverage:        sa.GPUUsageAverage,
 		GPUCost:                sa.GPUCost,
 		NetworkCost:            sa.NetworkCost,
 		LoadBalancerCost:       sa.LoadBalancerCost,
@@ -227,6 +243,14 @@ func (sa *SummaryAllocation) Equal(that *SummaryAllocation) bool {
 		return false
 	}
 
+	if sa.GPURequestAverage != that.GPURequestAverage {
+		return false
+	}
+
+	if sa.GPUUsageAverage != that.GPUUsageAverage {
+		return false
+	}
+
 	if sa.GPUCost != that.GPUCost {
 		return false
 	}

+ 4 - 0
core/pkg/opencost/summaryallocation_json.go

@@ -15,6 +15,8 @@ type SummaryAllocationResponse struct {
 	CPUCoreRequestAverage  *float64  `json:"cpuCoreRequestAverage"`
 	CPUCoreUsageAverage    *float64  `json:"cpuCoreUsageAverage"`
 	CPUCost                *float64  `json:"cpuCost"`
+	GPURequestAverage      *float64  `json:"gpuRequestAverage"`
+	GPUUsageAverage        *float64  `json:"gpuUsageAverage"`
 	GPUCost                *float64  `json:"gpuCost"`
 	NetworkCost            *float64  `json:"networkCost"`
 	LoadBalancerCost       *float64  `json:"loadBalancerCost"`
@@ -52,6 +54,8 @@ func (sa *SummaryAllocation) ToResponse() *SummaryAllocationResponse {
 		CPUCoreRequestAverage:  formatutil.Float64ToResponse(sa.CPUCoreRequestAverage),
 		CPUCoreUsageAverage:    formatutil.Float64ToResponse(sa.CPUCoreUsageAverage),
 		CPUCost:                formatutil.Float64ToResponse(sa.CPUCost),
+		GPURequestAverage:      formatutil.Float64ToResponse(sa.GPURequestAverage),
+		GPUUsageAverage:        formatutil.Float64ToResponse(sa.GPUUsageAverage),
 		GPUCost:                formatutil.Float64ToResponse(sa.GPUCost),
 		NetworkCost:            formatutil.Float64ToResponse(sa.NetworkCost),
 		LoadBalancerCost:       formatutil.Float64ToResponse(sa.LoadBalancerCost),

+ 6 - 0
pkg/costmodel/allocation.go

@@ -28,6 +28,7 @@ const (
 	queryFmtCPURequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtCPUUsageAvg                 = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 	queryFmtGPUsRequested               = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s])) by (container, pod, namespace, %s)`
 	queryFmtGPUsAllocated               = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtNodeCostPerCPUHr            = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerRAMGiBHr         = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
@@ -428,6 +429,9 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
 	resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
 
+	queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, env.GetPromClusterLabel())
+	resChGPUsUsageAvg := ctx.Query(queryGPUsUsageAvg)
+
 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
 	resChGPUsAllocated := ctx.QueryAtTime(queryGPUsAllocated, end)
 
@@ -544,6 +548,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	resRAMUsageAvg, _ := resChRAMUsageAvg.Await()
 	resRAMUsageMax, _ := resChRAMUsageMax.Await()
 	resGPUsRequested, _ := resChGPUsRequested.Await()
+	resGPUsUsageAvg, _ := resChGPUsUsageAvg.Await()
 	resGPUsAllocated, _ := resChGPUsAllocated.Await()
 
 	resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
@@ -610,6 +615,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	applyRAMBytesRequested(podMap, resRAMRequests, podUIDKeyMap)
 	applyRAMBytesUsedAvg(podMap, resRAMUsageAvg, podUIDKeyMap)
 	applyRAMBytesUsedMax(podMap, resRAMUsageMax, podUIDKeyMap)
+	applyGPUUsageAvg(podMap, resGPUsUsageAvg, podUIDKeyMap)
 	applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated, podUIDKeyMap)
 	applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes, podUIDKeyMap)
 	applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB, podUIDKeyMap, networkCrossZoneCost)

+ 49 - 0
pkg/costmodel/allocation_helpers.go

@@ -614,6 +614,48 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 	}
 }
 
+// applyGPUUsageAvg sets GPUUsageAverage on each matching container Allocation
+// in podMap from NVIDIA DCGM exporter query results. Results whose labels
+// cannot be resolved to a pod key (directly, or indirectly via podUIDKeyMap)
+// are skipped with a deduped warning rather than failing the computation.
+//
+// NOTE(review): the stored value is a fraction of a single GPU
+// (DCGM_FI_DEV_GPU_UTIL / 100), while GPURequestAverage is a GPU count;
+// confirm the intended semantics for containers requesting multiple GPUs.
+func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
+	// Example PromQueryResult: {container="dcgmproftester12", namespace="gpu", pod="dcgmproftester3-deployment-fc89c8dd6-ph7z5"} 99
+	for _, res := range resGPUUsageAvg {
+		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
+		if err != nil {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
+			continue
+		}
+
+		// Resolve the result to one or more pods: a direct key match yields a
+		// single pod; otherwise every pod reachable through the UID key map
+		// receives the value. Unresolvable keys are dropped silently.
+		var pods []*pod
+		if thisPod, ok := podMap[key]; !ok {
+			if uidKeys, ok := podUIDKeyMap[key]; ok {
+				for _, uidKey := range uidKeys {
+					thisPod, ok = podMap[uidKey]
+					if ok {
+						pods = append(pods, thisPod)
+					}
+				}
+			} else {
+				continue
+			}
+		} else {
+			pods = []*pod{thisPod}
+		}
+
+		for _, thisPod := range pods {
+			container, err := res.GetString("container")
+			if err != nil {
+				log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
+				continue
+			}
+			// Ensure an Allocation exists for this container before writing.
+			if _, ok := thisPod.Allocations[container]; !ok {
+				thisPod.appendContainer(container)
+			}
+
+			// DCGM_FI_DEV_GPU_UTIL is reported 0-100. Scale down to a
+			// fraction in [0, 1] so it is consistent with other usage fields.
+			// Only the first sample of the result is used (overwrites, does
+			// not accumulate, if a key resolves more than once).
+			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
+		}
+	}
+}
+
 func applyGPUsAllocated(podMap map[podKey]*pod, resGPUsRequested []*prom.QueryResult, resGPUsAllocated []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
 	if len(resGPUsAllocated) > 0 { // Use the new query, when it's become available in a window
 		resGPUsRequested = resGPUsAllocated
@@ -655,6 +697,13 @@ func applyGPUsAllocated(podMap map[podKey]*pod, resGPUsRequested []*prom.QueryRe
 
 			hrs := thisPod.Allocations[container].Minutes() / 60.0
 			thisPod.Allocations[container].GPUHours = res.Values[0].Value * hrs
+
+			// For now, it will always be the case that Request==Allocation. If
+			// you would like to use a GPU you need to request the full GPU.
+			// Therefore max(usage,request) will always equal request. In the
+			// future this may need to be refactored when building support for
+			// GPU Time Slicing.
+			thisPod.Allocations[container].GPURequestAverage = res.Values[0].Value
 		}
 	}
 }