Prechádzať zdrojové kódy

read and use gpu count

Sean Holcomb 5 rokov pred
rodič
commit
0c81c7a798

+ 1 - 0
PROMETHEUS.md

@@ -60,6 +60,7 @@ sum(node_total_hourly_cost) * 730
 | kubecost_load_balancer_cost   | Hourly cost of a load balancer                 |
 | kubecost_cluster_management_cost | Hourly management fee per cluster                 |
 | pv_hourly_cost   | Hourly cost per GB on a persistent volume                 |
+| node_gpu_count | Number of GPUs available on node |
 | container_cpu_allocation   | Average number of CPUs requested/used over last 1m                      |
 | container_gpu_allocation   | Average number of GPUs requested over last 1m                      |
 | container_memory_allocation_bytes   | Average bytes of RAM requested/used over last 1m                 |

+ 7 - 2
pkg/costmodel/cluster.go

@@ -389,6 +389,7 @@ type Node struct {
 	CPUCost         float64
 	CPUCores        float64
 	GPUCost         float64
+	GPUCount        float64
 	RAMCost         float64
 	RAMBytes        float64
 	Discount        float64
@@ -449,7 +450,8 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	queryNodeCPUCores := fmt.Sprintf(`avg_over_time(avg(kube_node_status_capacity_cpu_cores) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeRAMCost := fmt.Sprintf(`sum_over_time((avg(kube_node_status_capacity_memory_bytes) by (cluster_id, node) * on(cluster_id, node) group_right avg(node_ram_hourly_cost) by (cluster_id, node, instance_type, provider_id))[%s:%dm]%s) / 1024 / 1024 / 1024 * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
 	queryNodeRAMBytes := fmt.Sprintf(`avg_over_time(avg(kube_node_status_capacity_memory_bytes) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
-	queryNodeGPUCost := fmt.Sprintf(`sum_over_time((avg(node_gpu_hourly_cost * %d.0 / 60.0) by (cluster_id, node, provider_id))[%s:%dm]%s)`, minsPerResolution, durationStr, minsPerResolution, offsetStr)
+	queryNodeGPUCost := fmt.Sprintf(`sum_over_time((avg(node_gpu_count) by (cluster_id, node) * on(node, cluster_id) group_right avg(node_gpu_hourly_cost) by (cluster_id, node, instance_type, provider_id))[%s:%dm]%s) * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
+	queryNodeGPUCount := fmt.Sprintf(`avg_over_time(avg(node_gpu_count) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, cluster_id, mode)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
 	queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
@@ -463,6 +465,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resChNodeRAMCost := requiredCtx.Query(queryNodeRAMCost)
 	resChNodeRAMBytes := requiredCtx.Query(queryNodeRAMBytes)
 	resChNodeGPUCost := requiredCtx.Query(queryNodeGPUCost)
+	resChNodeGPUCount := requiredCtx.Query(queryNodeGPUCount)
 	resChActiveMins := requiredCtx.Query(queryActiveMins)
 	resChIsSpot := requiredCtx.Query(queryIsSpot)
 
@@ -475,6 +478,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resNodeCPUCost, _ := resChNodeCPUCost.Await()
 	resNodeCPUCores, _ := resChNodeCPUCores.Await()
 	resNodeGPUCost, _ := resChNodeGPUCost.Await()
+	resNodeGPUCount, _ := resChNodeGPUCount.Await()
 	resNodeRAMCost, _ := resChNodeRAMCost.Await()
 	resNodeRAMBytes, _ := resChNodeRAMBytes.Await()
 	resIsSpot, _ := resChIsSpot.Await()
@@ -506,6 +510,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 
 	cpuCoresMap := buildCPUCoresMap(resNodeCPUCores, clusterAndNameToType)
 
+	gpuCountMap := buildGPUCountMap(resNodeGPUCount)
 	ramBytesMap := buildRAMBytesMap(resNodeRAMBytes)
 
 	ramUserPctMap := buildRAMUserPctMap(resNodeRAMUserPct)
@@ -518,7 +523,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 
 	nodeMap := buildNodeMap(
 		cpuCostMap, ramCostMap, gpuCostMap,
-		cpuCoresMap, ramBytesMap, ramUserPctMap,
+		cpuCoresMap, ramBytesMap, ramUserPctMap, gpuCountMap,
 		ramSystemPctMap,
 		cpuBreakdownMap,
 		activeDataMap,

+ 37 - 1
pkg/costmodel/cluster_helpers.go

@@ -162,6 +162,38 @@ func buildGPUCostMap(
 	return gpuCostMap, clusterAndNameToType
 }
 
+// buildGPUCountMap indexes the node GPU count query results by cluster and
+// node name.
+//
+// For each result the cluster is read from the "cluster_id" label, falling
+// back to env.GetClusterID() when that label cannot be read. Results without
+// a "node" label, or without any values, are skipped with a warning. The
+// first value of each remaining result is stored as that node's GPU count;
+// if several results share the same cluster and node, the last one wins.
+func buildGPUCountMap(
+	resNodeGPUCount []*prom.QueryResult,
+) map[nodeIdentifierNoProviderID]float64 {
+	gpuCountMap := make(map[nodeIdentifierNoProviderID]float64, len(resNodeGPUCount))
+
+	for _, result := range resNodeGPUCount {
+		cluster, err := result.GetString("cluster_id")
+		if err != nil {
+			cluster = env.GetClusterID()
+		}
+
+		name, err := result.GetString("node")
+		if err != nil {
+			log.Warningf("ClusterNodes: GPU count data missing node")
+			continue
+		}
+
+		// Guard the index below: an empty value set would otherwise panic
+		// and take the whole node computation down with it.
+		if len(result.Values) == 0 {
+			log.Warningf("ClusterNodes: GPU count data has no values for node %s", name)
+			continue
+		}
+
+		gpuCount := result.Values[0].Value
+
+		key := nodeIdentifierNoProviderID{
+			Cluster: cluster,
+			Name:    name,
+		}
+		gpuCountMap[key] = gpuCount
+	}
+
+	return gpuCountMap
+}
+
 func buildCPUCoresMap(
 	resNodeCPUCores []*prom.QueryResult,
 	clusterAndNameToType map[nodeIdentifierNoProviderID]string,
@@ -573,7 +605,7 @@ func checkForKeyAndInitIfMissing(
 // complexity.
 func buildNodeMap(
 	cpuCostMap, ramCostMap, gpuCostMap map[NodeIdentifier]float64,
-	cpuCoresMap, ramBytesMap, ramUserPctMap,
+	cpuCoresMap, ramBytesMap, ramUserPctMap, gpuCountMap,
 	ramSystemPctMap map[nodeIdentifierNoProviderID]float64,
 	cpuBreakdownMap map[nodeIdentifierNoProviderID]*ClusterCostsBreakdown,
 	activeDataMap map[NodeIdentifier]activeData,
@@ -633,6 +665,10 @@ func buildNodeMap(
 			}
 		}
 
+		if GPUs, ok := gpuCountMap[clusterAndNameID]; ok {
+			nodePtr.GPUCount = GPUs
+		}
+
 		if ramBytes, ok := ramBytesMap[clusterAndNameID]; ok {
 			nodePtr.RAMBytes = ramBytes
 		}

+ 15 - 1
pkg/costmodel/cluster_helpers_test.go

@@ -133,6 +133,7 @@ func TestBuildNodeMap(t *testing.T) {
 		ramCostMap           map[NodeIdentifier]float64
 		gpuCostMap           map[NodeIdentifier]float64
 		cpuCoresMap          map[nodeIdentifierNoProviderID]float64
+		gpuCountMap          map[nodeIdentifierNoProviderID]float64
 		ramBytesMap          map[nodeIdentifierNoProviderID]float64
 		ramUserPctMap        map[nodeIdentifierNoProviderID]float64
 		ramSystemPctMap      map[nodeIdentifierNoProviderID]float64
@@ -318,6 +319,16 @@ func TestBuildNodeMap(t *testing.T) {
 					Name:    "node2",
 				}: 5.0,
 			},
+			gpuCountMap: map[nodeIdentifierNoProviderID]float64{
+				nodeIdentifierNoProviderID{
+					Cluster: "cluster1",
+					Name:    "node1",
+				}: 1.0,
+				nodeIdentifierNoProviderID{
+					Cluster: "cluster1",
+					Name:    "node2",
+				}: 2.0,
+			},
 			ramBytesMap: map[nodeIdentifierNoProviderID]float64{
 				nodeIdentifierNoProviderID{
 					Cluster: "cluster1",
@@ -450,6 +461,7 @@ func TestBuildNodeMap(t *testing.T) {
 					RAMCost:    0.09,
 					GPUCost:    0.8,
 					CPUCores:   2.0,
+					GPUCount: 1.0,
 					RAMBytes:   2048.0,
 					RAMBreakdown: &ClusterCostsBreakdown{
 						User:   30.0,
@@ -481,6 +493,7 @@ func TestBuildNodeMap(t *testing.T) {
 					RAMCost:    0.3,
 					GPUCost:    1.4,
 					CPUCores:   2.0,
+					GPUCount:   1.0,
 					RAMBytes:   2048.0,
 					RAMBreakdown: &ClusterCostsBreakdown{
 						User:   30.0,
@@ -512,6 +525,7 @@ func TestBuildNodeMap(t *testing.T) {
 					RAMCost:    0.024,
 					GPUCost:    3.1,
 					CPUCores:   5.0,
+					GPUCount:   2.0,
 					RAMBytes:   6303.0,
 					RAMBreakdown: &ClusterCostsBreakdown{
 						User:   42.6,
@@ -652,7 +666,7 @@ func TestBuildNodeMap(t *testing.T) {
 
 		result := buildNodeMap(
 			testCase.cpuCostMap, testCase.ramCostMap, testCase.gpuCostMap,
-			testCase.cpuCoresMap, testCase.ramBytesMap, testCase.ramUserPctMap,
+			testCase.cpuCoresMap, testCase.ramBytesMap, testCase.ramUserPctMap, testCase.gpuCountMap,
 			testCase.ramSystemPctMap,
 			testCase.cpuBreakdownMap,
 			testCase.activeDataMap,