há 1 ano atrás · 68d15105f9
--- a/pkg/costmodel/allocation.go
+++ b/pkg/costmodel/allocation.go
@@ -28,7 +28,7 @@ const (
 
				 	queryFmtCPURequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
			
 
				 	queryFmtCPUUsageAvg                 = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
			
 
				 	queryFmtGPUsRequested               = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
			
 
				-	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s])) by (container, pod, namespace, %s)`
			
 
				+	queryFmtGPUsUsageAvg                = `avg(avg_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
			
 
				 	queryFmtGPUsAllocated               = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
			
 
				 	queryFmtNodeCostPerCPUHr            = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
			
 
				 	queryFmtNodeCostPerRAMGiBHr         = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
			
--- a/pkg/costmodel/allocation_helpers.go
+++ b/pkg/costmodel/allocation_helpers.go
@@ -615,7 +615,7 @@ func applyRAMBytesUsedMax(podMap map[podKey]*pod, resRAMBytesUsedMax []*prom.Que
 
				 }
			
 
				 
			
 
				 func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey) {
			
 
				-	// Example PromQueryResult: {container="dcgmproftester12", namespace="gpu", pod="dcgmproftester3-deployment-fc89c8dd6-ph7z5"} 99
			
 
				+	// Example PromQueryResult: {container="dcgmproftester12", namespace="gpu", pod="dcgmproftester3-deployment-fc89c8dd6-ph7z5"} 0.997307
			
 
				 	for _, res := range resGPUUsageAvg {
			
 
				 		key, err := resultPodKey(res, env.GetPromClusterLabel(), "namespace")
			
 
				 		if err != nil {
			
@@ -649,9 +649,8 @@ func applyGPUUsageAvg(podMap map[podKey]*pod, resGPUUsageAvg []*prom.QueryResult
 
				 				thisPod.appendContainer(container)
			
 
				 			}
			
 
				 
			
 
				-			// DCGM_FI_DEV_GPU_UTIL metric is a number 0-100. Scale down to a
			
 
				-			// percentage so it is consistent with other fields.
			
 
				-			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value * 0.01
			
 
				+			// DCGM_FI_PROF_GR_ENGINE_ACTIVE is reported as a ratio in the range 0-1, so the value is stored as-is with no scaling.
			
 
				+			thisPod.Allocations[container].GPUUsageAverage = res.Values[0].Value
			
 
				 		}
			
 
				 	}
			
 
				 }