2 vuotta sitten · a92a5b3aee
--- a/core/pkg/opencost/allocation.go
+++ b/core/pkg/opencost/allocation.go
@@ -64,6 +64,8 @@ type Allocation struct {
 
				 	CPUCost                    float64               `json:"cpuCost"`
			
 
				 	CPUCostAdjustment          float64               `json:"cpuCostAdjustment"`
			
 
				 	GPUHours                   float64               `json:"gpuHours"`
			
 
				+	GPURequestAverage          float64               `json:"gpuRequestAverage"`
			
 
				+	GPUUsageAverage            float64               `json:"gpuUsageAverage"`
			
 
				 	GPUCost                    float64               `json:"gpuCost"`
			
 
				 	GPUCostAdjustment          float64               `json:"gpuCostAdjustment"`
			
 
				 	NetworkTransferBytes       float64               `json:"networkTransferBytes"`
			
@@ -669,6 +671,8 @@ func (a *Allocation) Clone() *Allocation {
 
				 		CPUCost:                        a.CPUCost,
			
 
				 		CPUCostAdjustment:              a.CPUCostAdjustment,
			
 
				 		GPUHours:                       a.GPUHours,
			
 
				+		GPURequestAverage:              a.GPURequestAverage,
			
 
				+		GPUUsageAverage:                a.GPUUsageAverage,
			
 
				 		GPUCost:                        a.GPUCost,
			
 
				 		GPUCostAdjustment:              a.GPUCostAdjustment,
			
 
				 		NetworkTransferBytes:           a.NetworkTransferBytes,
			
@@ -733,6 +737,12 @@ func (a *Allocation) Equal(that *Allocation) bool {
 
				 	if !util.IsApproximately(a.GPUHours, that.GPUHours) {
			
 
				 		return false
			
 
				 	}
			
 
				+	if !util.IsApproximately(a.GPURequestAverage, that.GPURequestAverage) {
			
 
				+		return false
			
 
				+	}
			
 
				+	if !util.IsApproximately(a.GPUUsageAverage, that.GPUUsageAverage) {
			
 
				+		return false
			
 
				+	}
			
 
				 	if !util.IsApproximately(a.GPUCost, that.GPUCost) {
			
 
				 		return false
			
 
				 	}
			
--- a/core/pkg/opencost/allocation_json.go
+++ b/core/pkg/opencost/allocation_json.go
@@ -26,6 +26,8 @@ type AllocationJSON struct {
 
				 	CPUEfficiency                  *float64                        `json:"cpuEfficiency"`
			
 
				 	GPUCount                       *float64                        `json:"gpuCount"`
			
 
				 	GPUHours                       *float64                        `json:"gpuHours"`
			
 
				+	GPURequestAverage              *float64                        `json:"gpuRequestAverage"`
			
 
				+	GPUUsageAverage                *float64                        `json:"gpuUsageAverage"`
			
 
				 	GPUCost                        *float64                        `json:"gpuCost"`
			
 
				 	GPUCostAdjustment              *float64                        `json:"gpuCostAdjustment"`
			
 
				 	NetworkTransferBytes           *float64                        `json:"networkTransferBytes"`
			
@@ -78,6 +80,8 @@ func (aj *AllocationJSON) BuildFromAllocation(a *Allocation) {
 
				 	aj.CPUEfficiency = formatFloat64ForResponse(a.CPUEfficiency())
			
 
				 	aj.GPUCount = formatFloat64ForResponse(a.GPUs())
			
 
				 	aj.GPUHours = formatFloat64ForResponse(a.GPUHours)
			
 
				+	aj.GPURequestAverage = formatFloat64ForResponse(a.GPURequestAverage)
			
 
				+	aj.GPUUsageAverage = formatFloat64ForResponse(a.GPUUsageAverage)
			
 
				 	aj.GPUCost = formatFloat64ForResponse(a.GPUCost)
			
 
				 	aj.GPUCostAdjustment = formatFloat64ForResponse(a.GPUCostAdjustment)
			
 
				 	aj.NetworkTransferBytes = formatFloat64ForResponse(a.NetworkTransferBytes)
			
--- a/pkg/costmodel/allocation.go
+++ b/pkg/costmodel/allocation.go
@@ -28,6 +28,7 @@ const (
 
				 	queryFmtCPURequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
			
 
				 	queryFmtCPUUsageAvg                 = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
			
 
				 	queryFmtGPUsRequested               = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
			
 
				+	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s])) by (container, pod, namespace, %s)`
			
 
				 	queryFmtGPUsAllocated               = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
			
 
				 	queryFmtNodeCostPerCPUHr            = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
			
 
				 	queryFmtNodeCostPerRAMGiBHr         = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
			
@@ -428,6 +429,9 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 
				 	queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
			
 
				 	resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
			
 
				 
			
 
				+	queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, env.GetPromClusterLabel())
			
 
				+	resChGPUsUsageAvg := ctx.Query(queryGPUsUsageAvg)
			
 
				+
			
 
				 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
			
 
				 	resChGPUsAllocated := ctx.QueryAtTime(queryGPUsAllocated, end)
			
 
				 
			
@@ -544,6 +548,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 
				 	resRAMUsageAvg, _ := resChRAMUsageAvg.Await()
			
 
				 	resRAMUsageMax, _ := resChRAMUsageMax.Await()
			
 
				 	resGPUsRequested, _ := resChGPUsRequested.Await()
			
 
				+	resGPUsUsageAvg, _ := resChGPUsUsageAvg.Await()
			
 
				 	resGPUsAllocated, _ := resChGPUsAllocated.Await()
			
 
				 
			
 
				 	resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
			
@@ -610,6 +615,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 
				 	applyRAMBytesRequested(podMap, resRAMRequests, podUIDKeyMap)
			
 
				 	applyRAMBytesUsedAvg(podMap, resRAMUsageAvg, podUIDKeyMap)
			
 
				 	applyRAMBytesUsedMax(podMap, resRAMUsageMax, podUIDKeyMap)
			
 
				+	applyGPUUsageAvg(podMap, resGPUsUsageAvg, podUIDKeyMap)
			
 
				 	applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated, podUIDKeyMap)
			
 
				 	applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes, podUIDKeyMap)
			
 
				 	applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB, podUIDKeyMap, networkCrossZoneCost)
			
--- a/pkg/costmodel/allocation_helpers.go
+++ b/pkg/costmodel/allocation_helpers.go
@@ -614,6 +614,38 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 
				 	}
			
 
				 }
			
 
				 
			
 
				+// Example PromQueryResult: DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-d63da75b-f5a3-de2e-67bd-a72803ded604",device="nvidia0",modelName="Tesla V100-SXM2-16GB",Hostname="ip-192-168-5-95.us-east-2.compute.internal",DCGM_FI_DRIVER_VERSION="535.161.08",container="dcgmproftester12",namespace="kubecost",pod="dcgmproftester2"} 99
			
 
				+func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
			
 
				+	for _, res := range resGPUUsageAvg {
			
 
				+		// TODO: The returned metric does not have an associated `cluster_id`? Does this need to be appended via scrapeconfig?
			
 
				+		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
			
 
				+		if err != nil {
			
 
				+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
			
 
				+			continue
			
 
				+		}
			
 
				+
			
 
				+		// TODO: Do we also need to check the podUIDKeyMap?
			
 
				+		pod, ok := podMap[key]
			
 
				+		if !ok {
			
 
				+			continue
			
 
				+		}
			
 
				+
			
 
				+		// Add the container to the pod if it doesn't exist
			
 
				+		container, err := res.GetString("container")
			
 
				+		if err != nil {
			
 
				+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
			
 
				+			continue
			
 
				+		}
			
 
				+		if _, ok := pod.Allocations[container]; !ok {
			
 
				+			pod.appendContainer(container)
			
 
				+		}
			
 
				+
			
 
				+		// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
			
 
				+		// percentage so it is consistent with other efficiency fields.
			
 
				+		pod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 func applyGPUsAllocated(podMap map[podKey]*pod, resGPUsRequested []*prom.QueryResult, resGPUsAllocated []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
			
 
				 	if len(resGPUsAllocated) > 0 { // Use the new query, when it's become available in a window
			
 
				 		resGPUsRequested = resGPUsAllocated