Переглянути джерело

Initial progress on gpu-efficiency. Still needs more troubleshooting &
testing.

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

thomasvn 2 роки тому
батько
коміт
a92a5b3aee

+ 10 - 0
core/pkg/opencost/allocation.go

@@ -64,6 +64,8 @@ type Allocation struct {
 	CPUCost                    float64               `json:"cpuCost"`
 	CPUCostAdjustment          float64               `json:"cpuCostAdjustment"`
 	GPUHours                   float64               `json:"gpuHours"`
+	GPURequestAverage          float64               `json:"gpuRequestAverage"`
+	GPUUsageAverage            float64               `json:"gpuUsageAverage"`
 	GPUCost                    float64               `json:"gpuCost"`
 	GPUCostAdjustment          float64               `json:"gpuCostAdjustment"`
 	NetworkTransferBytes       float64               `json:"networkTransferBytes"`
@@ -669,6 +671,8 @@ func (a *Allocation) Clone() *Allocation {
 		CPUCost:                        a.CPUCost,
 		CPUCostAdjustment:              a.CPUCostAdjustment,
 		GPUHours:                       a.GPUHours,
+		GPURequestAverage:              a.GPURequestAverage,
+		GPUUsageAverage:                a.GPUUsageAverage,
 		GPUCost:                        a.GPUCost,
 		GPUCostAdjustment:              a.GPUCostAdjustment,
 		NetworkTransferBytes:           a.NetworkTransferBytes,
@@ -733,6 +737,12 @@ func (a *Allocation) Equal(that *Allocation) bool {
 	if !util.IsApproximately(a.GPUHours, that.GPUHours) {
 		return false
 	}
+	if !util.IsApproximately(a.GPURequestAverage, that.GPURequestAverage) {
+		return false
+	}
+	if !util.IsApproximately(a.GPUUsageAverage, that.GPUUsageAverage) {
+		return false
+	}
 	if !util.IsApproximately(a.GPUCost, that.GPUCost) {
 		return false
 	}

+ 4 - 0
core/pkg/opencost/allocation_json.go

@@ -26,6 +26,8 @@ type AllocationJSON struct {
 	CPUEfficiency                  *float64                        `json:"cpuEfficiency"`
 	GPUCount                       *float64                        `json:"gpuCount"`
 	GPUHours                       *float64                        `json:"gpuHours"`
+	GPURequestAverage              *float64                        `json:"gpuRequestAverage"`
+	GPUUsageAverage                *float64                        `json:"gpuUsageAverage"`
 	GPUCost                        *float64                        `json:"gpuCost"`
 	GPUCostAdjustment              *float64                        `json:"gpuCostAdjustment"`
 	NetworkTransferBytes           *float64                        `json:"networkTransferBytes"`
@@ -78,6 +80,8 @@ func (aj *AllocationJSON) BuildFromAllocation(a *Allocation) {
 	aj.CPUEfficiency = formatFloat64ForResponse(a.CPUEfficiency())
 	aj.GPUCount = formatFloat64ForResponse(a.GPUs())
 	aj.GPUHours = formatFloat64ForResponse(a.GPUHours)
+	aj.GPURequestAverage = formatFloat64ForResponse(a.GPURequestAverage)
+	aj.GPUUsageAverage = formatFloat64ForResponse(a.GPUUsageAverage)
 	aj.GPUCost = formatFloat64ForResponse(a.GPUCost)
 	aj.GPUCostAdjustment = formatFloat64ForResponse(a.GPUCostAdjustment)
 	aj.NetworkTransferBytes = formatFloat64ForResponse(a.NetworkTransferBytes)

+ 6 - 0
pkg/costmodel/allocation.go

@@ -28,6 +28,7 @@ const (
 	queryFmtCPURequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtCPUUsageAvg                 = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 	queryFmtGPUsRequested               = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s])) by (container, pod, namespace, %s)`
 	queryFmtGPUsAllocated               = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtNodeCostPerCPUHr            = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerRAMGiBHr         = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
@@ -428,6 +429,9 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
 	resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
 
+	queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, env.GetPromClusterLabel())
+	resChGPUsUsageAvg := ctx.Query(queryGPUsUsageAvg)
+
 	queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
 	resChGPUsAllocated := ctx.QueryAtTime(queryGPUsAllocated, end)
 
@@ -544,6 +548,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	resRAMUsageAvg, _ := resChRAMUsageAvg.Await()
 	resRAMUsageMax, _ := resChRAMUsageMax.Await()
 	resGPUsRequested, _ := resChGPUsRequested.Await()
+	resGPUsUsageAvg, _ := resChGPUsUsageAvg.Await()
 	resGPUsAllocated, _ := resChGPUsAllocated.Await()
 
 	resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
@@ -610,6 +615,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	applyRAMBytesRequested(podMap, resRAMRequests, podUIDKeyMap)
 	applyRAMBytesUsedAvg(podMap, resRAMUsageAvg, podUIDKeyMap)
 	applyRAMBytesUsedMax(podMap, resRAMUsageMax, podUIDKeyMap)
+	applyGPUUsageAvg(podMap, resGPUsUsageAvg, podUIDKeyMap)
 	applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated, podUIDKeyMap)
 	applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes, podUIDKeyMap)
 	applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB, podUIDKeyMap, networkCrossZoneCost)

+ 32 - 0
pkg/costmodel/allocation_helpers.go

@@ -614,6 +614,38 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 	}
 }
 
+// applyGPUUsageAvg copies the average GPU utilization reported by each query
+// result onto the matching container allocation in podMap.
+//
+// Results whose pod key cannot be built, whose pod is not present in podMap,
+// whose "container" label is missing, or which carry no sample values are
+// skipped. A container that is not yet tracked on the pod is appended before
+// the value is written. podUIDKeyMap is accepted for signature parity with
+// the other apply* helpers but is not consulted here yet (see TODO below).
+//
+// Example PromQueryResult: DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-d63da75b-f5a3-de2e-67bd-a72803ded604",device="nvidia0",modelName="Tesla V100-SXM2-16GB",Hostname="ip-192-168-5-95.us-east-2.compute.internal",DCGM_FI_DRIVER_VERSION="535.161.08",container="dcgmproftester12",namespace="kubecost",pod="dcgmproftester2"} 99
+func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
+	for _, res := range resGPUUsageAvg {
+		// TODO: The returned metric does not have an associated `cluster_id`? Does this need to be appended via scrapeconfig?
+		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
+		if err != nil {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg result missing field: %s", err)
+			continue
+		}
+
+		// TODO: Do we also need to check the podUIDKeyMap?
+		thisPod, ok := podMap[key]
+		if !ok {
+			continue
+		}
+
+		container, err := res.GetString("container")
+		if err != nil {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result missing 'container': %s", key)
+			continue
+		}
+
+		// A result without any samples has nothing to record; indexing
+		// Values[0] on it would panic, so skip it instead.
+		if len(res.Values) == 0 {
+			log.DedupedWarningf(10, "CostModel.ComputeAllocation: GPU usage avg query result has no values: %s", key)
+			continue
+		}
+
+		// Add the container to the pod if it doesn't exist
+		if _, ok := thisPod.Allocations[container]; !ok {
+			thisPod.appendContainer(container)
+		}
+
+		// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
+		// 0-1 fraction so it is consistent with other efficiency fields.
+		thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
+	}
+}
+
 func applyGPUsAllocated(podMap map[podKey]*pod, resGPUsRequested []*prom.QueryResult, resGPUsAllocated []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
 	if len(resGPUsAllocated) > 0 { // Use the new query, when it's become available in a window
 		resGPUsRequested = resGPUsAllocated