Procházet zdrojové kódy

Bring GPURequestAverage and GPUEfficiency back into the Allocation API
response.

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

thomasvn před 2 roky
rodič
revize
2279782027

+ 15 - 0
core/pkg/opencost/allocation.go

@@ -952,6 +952,21 @@ func (a *Allocation) RAMEfficiency() float64 {
 	return 1.0
 }
 
+// GPUEfficiency is the ratio of GPU usage to GPU request when both averages are
+// positive. Without the NVIDIA DCGM exporter providing Prometheus with usage
+// metrics, GPUUsageAverage keeps its zero default and the result is always zero.
+func (a *Allocation) GPUEfficiency() float64 {
+	if a.GPURequestAverage > 0 && a.GPUUsageAverage > 0 {
+		return a.GPUUsageAverage / a.GPURequestAverage
+	}
+
+	if a.GPUUsageAverage == 0.0 || a.GPUCost == 0.0 { // no usage or no GPU cost: report zero
+		return 0.0
+	}
+
+	return 1.0 // usage and cost are nonzero, but the ratio branch above was not taken
+}
+
 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
 // there is no cost at all, then efficiency is zero.
 func (a *Allocation) TotalEfficiency() float64 {

+ 2 - 0
core/pkg/opencost/allocation_json.go

@@ -30,6 +30,7 @@ type AllocationJSON struct {
 	GPUUsageAverage                *float64                        `json:"gpuUsageAverage"`
 	GPUCost                        *float64                        `json:"gpuCost"`
 	GPUCostAdjustment              *float64                        `json:"gpuCostAdjustment"`
+	GPUEfficiency                  *float64                        `json:"gpuEfficiency"`
 	NetworkTransferBytes           *float64                        `json:"networkTransferBytes"`
 	NetworkReceiveBytes            *float64                        `json:"networkReceiveBytes"`
 	NetworkCost                    *float64                        `json:"networkCost"`
@@ -84,6 +85,7 @@ func (aj *AllocationJSON) BuildFromAllocation(a *Allocation) {
 	aj.GPUUsageAverage = formatFloat64ForResponse(a.GPUUsageAverage)
 	aj.GPUCost = formatFloat64ForResponse(a.GPUCost)
 	aj.GPUCostAdjustment = formatFloat64ForResponse(a.GPUCostAdjustment)
+	aj.GPUEfficiency = formatFloat64ForResponse(a.GPUEfficiency())
 	aj.NetworkTransferBytes = formatFloat64ForResponse(a.NetworkTransferBytes)
 	aj.NetworkReceiveBytes = formatFloat64ForResponse(a.NetworkReceiveBytes)
 	aj.NetworkCost = formatFloat64ForResponse(a.NetworkCost)

+ 1 - 0
pkg/costmodel/allocation.go

@@ -430,6 +430,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
 
 	queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, env.GetPromClusterLabel())
+	log.Infof("THOMAS: queryGPUsUsageAvg: %s", queryGPUsUsageAvg)
 	resChGPUsUsageAvg := ctx.Query(queryGPUsUsageAvg)
 
 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())

+ 31 - 17
pkg/costmodel/allocation_helpers.go

@@ -617,32 +617,45 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 // Example PromQueryResult: DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-d63da75b-f5a3-de2e-67bd-a72803ded604",device="nvidia0",modelName="Tesla V100-SXM2-16GB",Hostname="ip-192-168-5-95.us-east-2.compute.internal",DCGM_FI_DRIVER_VERSION="535.161.08",container="dcgmproftester12",namespace="kubecost",pod="dcgmproftester2"} 99
 func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
 	for _, res := range resGPUUsageAvg {
-		// TODO: The returned metric does not have an associated `cluster_id`? Does this need to be appended via scrapeconfig?
+
+		log.Infof("THOMAS: applyGPUUsageAvg result: %v: %v", res.Metric, res.Values[0].Value) // NOTE(review): debug-style log; reads res.Values[0] with no length check visible in this function
+
 		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
 		if err != nil {
 			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
 			continue
 		}
 
-		// TODO: Do we also need to check the podUIDKeyMap?
-		pod, ok := podMap[key]
-		if !ok {
-			continue
+		var pods []*pod // every pod this result should be applied to
+		if thisPod, ok := podMap[key]; !ok { // no pod stored under the direct key
+			if uidKeys, ok := podUIDKeyMap[key]; ok { // fall back to the keys podUIDKeyMap lists for this key
+				for _, uidKey := range uidKeys {
+					thisPod, ok = podMap[uidKey]
+					if ok {
+						pods = append(pods, thisPod)
+					}
+				}
+			} else {
+				continue // key is in neither map; skip this result
+			}
+		} else {
+			pods = []*pod{thisPod} // direct hit: apply to exactly one pod
 		}
 
-		// Add the container to the pod if it doesn't exist
-		container, err := res.GetString("container")
-		if err != nil {
-			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
-			continue
-		}
-		if _, ok := pod.Allocations[container]; !ok {
-			pod.appendContainer(container)
-		}
+		for _, thisPod := range pods {
+			container, err := res.GetString("container")
+			if err != nil {
+				log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
+				continue // advances the inner pods loop, not the outer results loop
+			}
+			if _, ok := thisPod.Allocations[container]; !ok {
+				thisPod.appendContainer(container)
+			}
 
-		// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
-		// percentage so it is consistent with other efficiency fields.
-		pod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
+			// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
+			// 0-1 fraction so it is consistent with other fields.
+			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
+		}
 	}
 }
 
@@ -687,6 +700,7 @@ func applyGPUsAllocated(podMap map[podKey]*pod, resGPUsRequested []*prom.QueryRe
 
 			hrs := thisPod.Allocations[container].Minutes() / 60.0
 			thisPod.Allocations[container].GPUHours = res.Values[0].Value * hrs
+			thisPod.Allocations[container].GPURequestAverage = res.Values[0].Value
 		}
 	}
 }