Przeglądaj źródła

ClusterNodes: RAM and CPU breakdowns

Niko Kovacevic 5 lat temu
rodzic
commit
3574e4e687
1 zmienionych plików z 135 dodań i 27 usunięć
  1. 135 27
      pkg/costmodel/cluster.go

+ 135 - 27
pkg/costmodel/cluster.go

@@ -260,17 +260,19 @@ func ClusterDisks(client prometheus.Client, provider cloud.Provider, duration, o
 }
 
 type Node struct {
-	Cluster     string
-	Name        string
-	ProviderID  string
-	NodeType    string
-	CPUCost     float64
-	CPUCores    float64
-	GPUCost     float64
-	RAMCost     float64
-	RAMBytes    float64
-	Discount    float64
-	Preemptible bool
+	Cluster      string
+	Name         string
+	ProviderID   string
+	NodeType     string
+	CPUCost      float64
+	CPUCores     float64
+	GPUCost      float64
+	RAMCost      float64
+	RAMBytes     float64
+	Discount     float64
+	Preemptible  bool
+	CPUBreakdown *ClusterCostsBreakdown
+	RAMBreakdown *ClusterCostsBreakdown
 }
 
 func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset time.Duration) (map[string]*Node, []error) {
@@ -297,6 +299,9 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	queryNodeRAMBytes := fmt.Sprintf(`avg_over_time(avg(kube_node_status_capacity_memory_bytes) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeGPUCost := fmt.Sprintf(`sum_over_time((avg(node_gpu_hourly_cost) by (cluster_id, node, provider_id))[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeLabels := fmt.Sprintf(`count_over_time(kube_node_labels[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
+	queryNodeCPUModePct := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, cluster_id, mode) / ignoring(mode) group_left sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
+	queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_usage_bytes{container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / sum(sum_over_time(label_replace(kube_node_status_capacity_memory_bytes, "instance", "$1", "kubernetes_node", "(.*)")[%s:%dm]%s)) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
+	queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!=""}[%s:%dm]%s)) by (instance, cluster_id) / sum(sum_over_time(label_replace(kube_node_status_capacity_memory_bytes, "instance", "$1", "kubernetes_node", "(.*)")[%s:%dm]%s)) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
 
 	resChNodeCPUCost := ctx.Query(queryNodeCPUCost)
 	resChNodeCPUCores := ctx.Query(queryNodeCPUCores)
@@ -304,6 +309,9 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resChNodeRAMBytes := ctx.Query(queryNodeRAMBytes)
 	resChNodeGPUCost := ctx.Query(queryNodeGPUCost)
 	resChNodeLabels := ctx.Query(queryNodeLabels)
+	resChNodeCPUModePct := ctx.Query(queryNodeCPUModePct)
+	resChNodeRAMSystemPct := ctx.Query(queryNodeRAMSystemPct)
+	resChNodeRAMUserPct := ctx.Query(queryNodeRAMUserPct)
 
 	resNodeCPUCost, _ := resChNodeCPUCost.Await()
 	resNodeCPUCores, _ := resChNodeCPUCores.Await()
@@ -311,6 +319,9 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resNodeRAMCost, _ := resChNodeRAMCost.Await()
 	resNodeRAMBytes, _ := resChNodeRAMBytes.Await()
 	resNodeLabels, _ := resChNodeLabels.Await()
+	resNodeCPUModePct, _ := resChNodeCPUModePct.Await()
+	resNodeRAMSystemPct, _ := resChNodeRAMSystemPct.Await()
+	resNodeRAMUserPct, _ := resChNodeRAMUserPct.Await()
 	if ctx.ErrorCollector.IsError() {
 		return nil, ctx.Errors()
 	}
@@ -337,10 +348,12 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 		key := fmt.Sprintf("%s/%s", cluster, name)
 		if _, ok := nodeMap[key]; !ok {
 			nodeMap[key] = &Node{
-				Cluster:    cluster,
-				Name:       name,
-				NodeType:   nodeType,
-				ProviderID: cp.ParseID(providerID),
+				Cluster:      cluster,
+				Name:         name,
+				NodeType:     nodeType,
+				ProviderID:   cp.ParseID(providerID),
+				CPUBreakdown: &ClusterCostsBreakdown{},
+				RAMBreakdown: &ClusterCostsBreakdown{},
 			}
 		}
 		nodeMap[key].CPUCost += cpuCost
@@ -364,8 +377,10 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 		key := fmt.Sprintf("%s/%s", cluster, name)
 		if _, ok := nodeMap[key]; !ok {
 			nodeMap[key] = &Node{
-				Cluster: cluster,
-				Name:    name,
+				Cluster:      cluster,
+				Name:         name,
+				CPUBreakdown: &ClusterCostsBreakdown{},
+				RAMBreakdown: &ClusterCostsBreakdown{},
 			}
 		}
 		nodeMap[key].CPUCores = cpuCores
@@ -391,10 +406,12 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 		key := fmt.Sprintf("%s/%s", cluster, name)
 		if _, ok := nodeMap[key]; !ok {
 			nodeMap[key] = &Node{
-				Cluster:    cluster,
-				Name:       name,
-				NodeType:   nodeType,
-				ProviderID: cp.ParseID(providerID),
+				Cluster:      cluster,
+				Name:         name,
+				NodeType:     nodeType,
+				ProviderID:   cp.ParseID(providerID),
+				CPUBreakdown: &ClusterCostsBreakdown{},
+				RAMBreakdown: &ClusterCostsBreakdown{},
 			}
 		}
 		nodeMap[key].RAMCost += ramCost
@@ -418,8 +435,10 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 		key := fmt.Sprintf("%s/%s", cluster, name)
 		if _, ok := nodeMap[key]; !ok {
 			nodeMap[key] = &Node{
-				Cluster: cluster,
-				Name:    name,
+				Cluster:      cluster,
+				Name:         name,
+				CPUBreakdown: &ClusterCostsBreakdown{},
+				RAMBreakdown: &ClusterCostsBreakdown{},
 			}
 		}
 		nodeMap[key].RAMBytes = ramBytes
@@ -445,15 +464,101 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 		key := fmt.Sprintf("%s/%s", cluster, name)
 		if _, ok := nodeMap[key]; !ok {
 			nodeMap[key] = &Node{
-				Cluster:    cluster,
-				Name:       name,
-				NodeType:   nodeType,
-				ProviderID: cp.ParseID(providerID),
+				Cluster:      cluster,
+				Name:         name,
+				NodeType:     nodeType,
+				ProviderID:   cp.ParseID(providerID),
+				CPUBreakdown: &ClusterCostsBreakdown{},
+				RAMBreakdown: &ClusterCostsBreakdown{},
 			}
 		}
 		nodeMap[key].GPUCost += gpuCost
 	}
 
+	for _, result := range resNodeCPUModePct {
+		cluster, err := result.GetString("cluster_id")
+		if err != nil {
+			cluster = env.GetClusterID()
+		}
+
+		name, err := result.GetString("kubernetes_node")
+		if err != nil {
+			log.Warningf("ClusterNodes: CPU mode data missing node")
+			continue
+		}
+
+		mode, err := result.GetString("mode")
+		if err != nil {
+			log.Warningf("ClusterNodes: unable to read CPU mode: %s", err)
+			mode = "other"
+		}
+
+		pct := result.Values[0].Value
+
+		key := fmt.Sprintf("%s/%s", cluster, name)
+		if _, ok := nodeMap[key]; !ok {
+			log.Warningf("ClusterNodes: CPU mode data for unidentified node")
+			continue
+		}
+
+		switch mode {
+		case "idle":
+			nodeMap[key].CPUBreakdown.Idle += pct
+		case "system":
+			nodeMap[key].CPUBreakdown.System += pct
+		case "user":
+			nodeMap[key].CPUBreakdown.User += pct
+		default:
+			nodeMap[key].CPUBreakdown.Other += pct
+		}
+	}
+
+	for _, result := range resNodeRAMSystemPct {
+		cluster, err := result.GetString("cluster_id")
+		if err != nil {
+			cluster = env.GetClusterID()
+		}
+
+		name, err := result.GetString("instance")
+		if err != nil {
+			log.Warningf("ClusterNodes: RAM system percent missing node")
+			continue
+		}
+
+		pct := result.Values[0].Value
+
+		key := fmt.Sprintf("%s/%s", cluster, name)
+		if _, ok := nodeMap[key]; !ok {
+			log.Warningf("ClusterNodes: RAM system percent for unidentified node")
+			continue
+		}
+
+		nodeMap[key].RAMBreakdown.System += pct
+	}
+
+	for _, result := range resNodeRAMUserPct {
+		cluster, err := result.GetString("cluster_id")
+		if err != nil {
+			cluster = env.GetClusterID()
+		}
+
+		name, err := result.GetString("instance")
+		if err != nil {
+			log.Warningf("ClusterNodes: RAM system percent missing node")
+			continue
+		}
+
+		pct := result.Values[0].Value
+
+		key := fmt.Sprintf("%s/%s", cluster, name)
+		if _, ok := nodeMap[key]; !ok {
+			log.Warningf("ClusterNodes: RAM system percent for unidentified node")
+			continue
+		}
+
+		nodeMap[key].RAMBreakdown.User += pct
+	}
+
 	// Determine preemptibility with node labels
 	for _, result := range resNodeLabels {
 		nodeName, err := result.GetString("node")
@@ -490,6 +595,9 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	for _, node := range nodeMap {
 		// TODO take RI into account
 		node.Discount = cp.CombinedDiscountForNode(node.NodeType, node.Preemptible, discount, negotiatedDiscount)
+
+		// Apply all remaining RAM to Idle
+		node.RAMBreakdown.Idle = 1.0 - (node.RAMBreakdown.System + node.RAMBreakdown.Other + node.RAMBreakdown.User)
 	}
 
 	return nodeMap, nil