před 2 roky · 2279782027
--- a/core/pkg/opencost/allocation.go
+++ b/core/pkg/opencost/allocation.go
@@ -952,6 +952,21 @@ func (a *Allocation) RAMEfficiency() float64 {
 
				 	return 1.0
			
 
				 }
			
 
				 
			
 
				+// GPUEfficiency reports GPUUsageAverage divided by GPURequestAverage when both
				+// are positive. Failing that, it reports 0 when usage or GPUCost is zero and
				+// 1 otherwise. Without the NVIDIA DCGM exporter supplying Prometheus with
				+// usage metrics, GPUUsageAverage keeps its zero default, so the result is 0.
				+func (a *Allocation) GPUEfficiency() float64 {
				+	switch {
				+	case a.GPURequestAverage > 0 && a.GPUUsageAverage > 0:
				+		return a.GPUUsageAverage / a.GPURequestAverage
				+	case a.GPUUsageAverage == 0.0 || a.GPUCost == 0.0:
				+		return 0.0
				+	default:
				+		return 1.0
				+	}
				+}
			
 
				+
			
 
				 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
			
 
				 // there is no cost at all, then efficiency is zero.
			
 
				 func (a *Allocation) TotalEfficiency() float64 {
			
--- a/core/pkg/opencost/allocation_json.go
+++ b/core/pkg/opencost/allocation_json.go
@@ -30,6 +30,7 @@ type AllocationJSON struct {
 
				 	GPUUsageAverage                *float64                        `json:"gpuUsageAverage"`
			
 
				 	GPUCost                        *float64                        `json:"gpuCost"`
			
 
				 	GPUCostAdjustment              *float64                        `json:"gpuCostAdjustment"`
			
 
				+	GPUEfficiency                  *float64                        `json:"gpuEfficiency"`
			
 
				 	NetworkTransferBytes           *float64                        `json:"networkTransferBytes"`
			
 
				 	NetworkReceiveBytes            *float64                        `json:"networkReceiveBytes"`
			
 
				 	NetworkCost                    *float64                        `json:"networkCost"`
			
@@ -84,6 +85,7 @@ func (aj *AllocationJSON) BuildFromAllocation(a *Allocation) {
 
				 	aj.GPUUsageAverage = formatFloat64ForResponse(a.GPUUsageAverage)
			
 
				 	aj.GPUCost = formatFloat64ForResponse(a.GPUCost)
			
 
				 	aj.GPUCostAdjustment = formatFloat64ForResponse(a.GPUCostAdjustment)
			
 
				+	aj.GPUEfficiency = formatFloat64ForResponse(a.GPUEfficiency())
			
 
				 	aj.NetworkTransferBytes = formatFloat64ForResponse(a.NetworkTransferBytes)
			
 
				 	aj.NetworkReceiveBytes = formatFloat64ForResponse(a.NetworkReceiveBytes)
			
 
				 	aj.NetworkCost = formatFloat64ForResponse(a.NetworkCost)
			
--- a/pkg/costmodel/allocation.go
+++ b/pkg/costmodel/allocation.go
@@ -430,6 +430,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 
				 	resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
			
 
				 
			
 
				 	queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, env.GetPromClusterLabel())
			
 
				+	log.Infof("THOMAS: queryGPUsUsageAvg: %s", queryGPUsUsageAvg)
			
 
				 	resChGPUsUsageAvg := ctx.Query(queryGPUsUsageAvg)
			
 
				 
			
 
				 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
			
--- a/pkg/costmodel/allocation_helpers.go
+++ b/pkg/costmodel/allocation_helpers.go
@@ -617,32 +617,45 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 
				 // Example PromQueryResult: DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-d63da75b-f5a3-de2e-67bd-a72803ded604",device="nvidia0",modelName="Tesla V100-SXM2-16GB",Hostname="ip-192-168-5-95.us-east-2.compute.internal",DCGM_FI_DRIVER_VERSION="535.161.08",container="dcgmproftester12",namespace="kubecost",pod="dcgmproftester2"} 99
			
 
				 func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
				 	for _, res := range resGPUUsageAvg {
				-		// TODO: The returned metric does not have an associated `cluster_id`? Does this need to be appended via scrapeconfig?
				 		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
				 		if err != nil {
				 			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
				 			continue
				 		}
				 
				-		// TODO: Do we also need to check the podUIDKeyMap?
				-		pod, ok := podMap[key]
				-		if !ok {
				+		// Skip sample-less results so the Values[0] read below cannot panic.
				+		if len(res.Values) == 0 {
				+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result has no values: %s", key)
				 			continue
				 		}
				 
				-		// Add the container to the pod if it doesn't exist
				+		// Use the pod stored under key, else every pod its podUIDKeyMap entries reach.
				+		var pods []*pod
				+		if thisPod, ok := podMap[key]; ok {
				+			pods = append(pods, thisPod)
				+		} else {
				+			for _, uidKey := range podUIDKeyMap[key] {
				+				if uidPod, ok := podMap[uidKey]; ok {
				+					pods = append(pods, uidPod)
				+				}
				+			}
				+		}
				+		if len(pods) == 0 {
				+			continue
				+		}
				 		container, err := res.GetString("container")
				 		if err != nil {
				 			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
				 			continue
				 		}
				-		if _, ok := pod.Allocations[container]; !ok {
				-			pod.appendContainer(container)
				-		}
				 
				-		// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
				-		// percentage so it is consistent with other efficiency fields.
				-		pod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
				+		// DCGM_FI_DEV_GPU_UTIL is a number 0-100; scale it down to match other fields.
				+		for _, thisPod := range pods {
				+			if _, ok := thisPod.Allocations[container]; !ok {
				+				thisPod.appendContainer(container)
				+			}
				+			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
				+		}
				 	}
				 }
			
 
				 
			
@@ -687,6 +700,7 @@ func applyGPUsAllocated(podMap map[podKey]*pod, resGPUsRequested []*prom.QueryRe
 
				 
			
 
				 			hrs := thisPod.Allocations[container].Minutes() / 60.0
			
 
				 			thisPod.Allocations[container].GPUHours = res.Values[0].Value * hrs
			
 
				+			thisPod.Allocations[container].GPURequestAverage = res.Values[0].Value
			
 
				 		}
			
 
				 	}
			
 
				 }