Kaynağa Gözat

Simplify GPU query (#2312)

This PR addresses an optimization in the GPURequest query by removing its reliance on the kube_pod_status_phase metric. Unlike CPURequest and RAMRequest, GPURequest uniquely utilizes kube_pod_status_phase. However, the rationale behind this specific usage is unclear, as it lacks documentation in the git history, and the original author has no recollection of the reasoning.

I suspect it's been done to filter out not running pods. I think node!="" filter in kube_pod_container_resource_requests query should be sufficient to select only pods allocated to nodes and aligns with all other queries.

It's the only place where kube_pod_status_phase metric is used. It's one of the most resource-intensive metrics. It produces 5 series per pod.
---------

Signed-off-by: r2k1 <yokree@gmail.com>
Co-authored-by: Ajay Tripathy <ajay@kubecost.com>
Co-authored-by: Matt Ray <github@mattray.dev>
Artur Khantimirov 2 yıl önce
ebeveyn
işleme
4aef88c1d3
1 değiştirilmiş dosya ile 10 ekleme ve 15 silme
  1. 10 15
      pkg/costmodel/costmodel.go

+ 10 - 15
pkg/costmodel/costmodel.go

@@ -146,9 +146,8 @@ const (
 	queryRAMRequestsStr = `avg(
 		label_replace(
 			label_replace(
-				avg(
-					sum_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="",container!="POD", node!="", %s}[%s] %s)
-				) by (namespace,container,pod,node,%s) , "container_name","$1","container","(.+)"
+				sum_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="",container!="POD", node!="", %s}[%s] %s)
+				, "container_name","$1","container","(.+)"
 			), "pod_name","$1","pod","(.+)"
 		)
 	) by (namespace,container_name,pod_name,node,%s)`
@@ -164,9 +163,8 @@ const (
 	queryCPURequestsStr = `avg(
 		label_replace(
 			label_replace(
-				avg(
-					sum_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="",container!="POD", node!="", %s}[%s] %s)
-				) by (namespace,container,pod,node,%s) , "container_name","$1","container","(.+)"
+				sum_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="",container!="POD", node!="", %s}[%s] %s)
+				, "container_name","$1","container","(.+)"
 			), "pod_name","$1","pod","(.+)"
 		)
 	) by (namespace,container_name,pod_name,node,%s)`
@@ -184,14 +182,11 @@ const (
 	queryGPURequestsStr = `avg(
 		label_replace(
 			label_replace(
-				avg(
-					sum_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s] %s)
-					* %f
-				) by (namespace,container,pod,node,%s) , "container_name","$1","container","(.+)"
+				sum_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s] %s), 
+				"container_name","$1","container","(.+)"
 			), "pod_name","$1","pod","(.+)"
 		)
-	) by (namespace,container_name,pod_name,node,%s)
-	* on (pod_name, namespace, %s) group_left(container) label_replace(avg(avg_over_time(kube_pod_status_phase{phase="Running", %s}[%s] %s)) by (pod,namespace,%s), "pod_name","$1","pod","(.+)")`
+	) by (namespace,container_name,pod_name,node,%s)`
 	queryPVRequestsStr = `avg(avg(kube_persistentvolumeclaim_info{volumename != "", %s}) by (persistentvolumeclaim, storageclass, namespace, volumename, %s, kubernetes_node)
 	*
 	on (persistentvolumeclaim, namespace, %s, kubernetes_node) group_right(storageclass, volumename)
@@ -1740,11 +1735,11 @@ func (cm *CostModel) costDataRange(cli prometheusClient.Client, cp costAnalyzerC
 
 	queryRAMAlloc := fmt.Sprintf(queryRAMAllocationByteHours, env.GetPromClusterFilter(), resStr, env.GetPromClusterLabel(), scrapeIntervalSeconds)
 	queryCPUAlloc := fmt.Sprintf(queryCPUAllocationVCPUHours, env.GetPromClusterFilter(), resStr, env.GetPromClusterLabel(), scrapeIntervalSeconds)
-	queryRAMRequests := fmt.Sprintf(queryRAMRequestsStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel(), env.GetPromClusterLabel())
+	queryRAMRequests := fmt.Sprintf(queryRAMRequestsStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
 	queryRAMUsage := fmt.Sprintf(queryRAMUsageStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
-	queryCPURequests := fmt.Sprintf(queryCPURequestsStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel(), env.GetPromClusterLabel())
+	queryCPURequests := fmt.Sprintf(queryCPURequestsStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
 	queryCPUUsage := fmt.Sprintf(queryCPUUsageStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
-	queryGPURequests := fmt.Sprintf(queryGPURequestsStr, env.GetPromClusterFilter(), resStr, "", resolution.Hours(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
+	queryGPURequests := fmt.Sprintf(queryGPURequestsStr, env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
 	queryPVRequests := fmt.Sprintf(queryPVRequestsStr, env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterLabel())
 	queryPVCAllocation := fmt.Sprintf(queryPVCAllocationFmt, env.GetPromClusterFilter(), resStr, env.GetPromClusterLabel(), scrapeIntervalSeconds)
 	queryPVHourlyCost := fmt.Sprintf(queryPVHourlyCostFmt, env.GetPromClusterFilter(), resStr)