Ver Fonte

Swap `DCGM_FI_DEV_GPU_UTIL` for `DCGM_FI_PROF_GR_ENGINE_ACTIVE`. (#2853)

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>
Thomas Nguyen há 1 ano atrás
pai
commit
68d15105f9
2 ficheiros alterados com 4 adições e 5 exclusões
  1. 1 1
      pkg/costmodel/allocation.go
  2. 3 4
      pkg/costmodel/allocation_helpers.go

+ 1 - 1
pkg/costmodel/allocation.go

@@ -28,7 +28,7 @@ const (
 	queryFmtCPURequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtCPUUsageAvg                 = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 	queryFmtGPUsRequested               = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
-	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s])) by (container, pod, namespace, %s)`
+	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
 	queryFmtGPUsAllocated               = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtNodeCostPerCPUHr            = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
 	queryFmtNodeCostPerRAMGiBHr         = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`

+ 3 - 4
pkg/costmodel/allocation_helpers.go

@@ -615,7 +615,7 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 }
 
 func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
-	// Example PromQueryResult: {container="dcgmproftester12", namespace="gpu", pod="dcgmproftester3-deployment-fc89c8dd6-ph7z5"} 99
+	// Example PromQueryResult: {container="dcgmproftester12", namespace="gpu", pod="dcgmproftester3-deployment-fc89c8dd6-ph7z5"} 0.997307
 	for _, res := range resGPUUsageAvg {
 		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
 		if err != nil {
@@ -649,9 +649,8 @@ func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult
 				thisPod.appendContainer(container)
 			}
 
-			// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
-			// percentage so it is consistent with other fields.
-			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
+			// DCGM_FI_PROF_GR_ENGINE_ACTIVE is already a ratio between 0 and 1, so no scaling is needed.
+			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value
 		}
 	}
 }