Bläddra i källkod

Move gpu cost multiplication outside of prom query

Sean Holcomb 5 år sedan
förälder
incheckning
6362fc6d29
3 ändrade filer med 81 tillägg och 54 borttagningar
  1. 9 4
      pkg/costmodel/cluster.go
  2. 26 14
      pkg/costmodel/cluster_helpers.go
  3. 46 36
      pkg/costmodel/cluster_helpers_test.go

+ 9 - 4
pkg/costmodel/cluster.go

@@ -452,6 +452,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	queryNodeRAMBytes := fmt.Sprintf(`avg_over_time(avg(kube_node_status_capacity_memory_bytes) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeGPUCost := fmt.Sprintf(`sum_over_time((avg(node_gpu_count) by (cluster_id, node) * on(node, cluster_id) group_right avg(node_gpu_hourly_cost) by (cluster_id, node, instance_type, provider_id))[%s:%dm]%s) * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
 	queryNodeGPUCount := fmt.Sprintf(`avg_over_time(avg(node_gpu_count) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
+	queryNodeGPUHourlySum := fmt.Sprintf(`sum_over_time(avg(node_gpu_hourly_cost) by (cluster_id, node, instance_type, provider_id)[%s:%dm]%s) * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
 	queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, cluster_id, mode)`, durationStr, minsPerResolution, offsetStr)
 	queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
 	queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
@@ -466,6 +467,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resChNodeRAMBytes := requiredCtx.Query(queryNodeRAMBytes)
 	resChNodeGPUCost := requiredCtx.Query(queryNodeGPUCost)
 	resChNodeGPUCount := requiredCtx.Query(queryNodeGPUCount)
+	resChNodeGPUHourlySum := requiredCtx.Query(queryNodeGPUHourlySum)
 	resChActiveMins := requiredCtx.Query(queryActiveMins)
 	resChIsSpot := requiredCtx.Query(queryIsSpot)
 
@@ -479,6 +481,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resNodeCPUCores, _ := resChNodeCPUCores.Await()
 	resNodeGPUCost, _ := resChNodeGPUCost.Await()
 	resNodeGPUCount, _ := resChNodeGPUCount.Await()
+	resNodeGPUHourlySum, _ := resChNodeGPUHourlySum.Await()
 	resNodeRAMCost, _ := resChNodeRAMCost.Await()
 	resNodeRAMBytes, _ := resChNodeRAMBytes.Await()
 	resIsSpot, _ := resChIsSpot.Await()
@@ -488,6 +491,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	resActiveMins, _ := resChActiveMins.Await()
 	resLabels, _ := resChLabels.Await()
 
+	log.Infof("%v", resNodeGPUCost)
 	if optionalCtx.HasErrors() {
 		for _, err := range optionalCtx.Errors() {
 			log.Warningf("ClusterNodes: %s", err)
@@ -501,16 +505,17 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 		return nil, requiredCtx.ErrorCollection()
 	}
 
+	gpuCountMap := buildGPUCountMap(resNodeGPUCount, cp.ParseID)
+
 	cpuCostMap, clusterAndNameToType1 := buildCPUCostMap(resNodeCPUCost, cp.ParseID)
 	ramCostMap, clusterAndNameToType2 := buildRAMCostMap(resNodeRAMCost, cp.ParseID)
-	gpuCostMap, clusterAndNameToType3 := buildGPUCostMap(resNodeGPUCost, cp.ParseID)
+	gpuCostMap, clusterAndNameToType3 := buildGPUCostMap(resNodeGPUHourlySum, gpuCountMap, cp.ParseID)
 
 	clusterAndNameToTypeIntermediate := mergeTypeMaps(clusterAndNameToType1, clusterAndNameToType2)
 	clusterAndNameToType := mergeTypeMaps(clusterAndNameToTypeIntermediate, clusterAndNameToType3)
 
 	cpuCoresMap := buildCPUCoresMap(resNodeCPUCores, clusterAndNameToType)
 
-	gpuCountMap := buildGPUCountMap(resNodeGPUCount)
 	ramBytesMap := buildRAMBytesMap(resNodeRAMBytes)
 
 	ramUserPctMap := buildRAMUserPctMap(resNodeRAMUserPct)
@@ -522,8 +527,8 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
 	labelsMap := buildLabelsMap(resLabels)
 
 	nodeMap := buildNodeMap(
-		cpuCostMap, ramCostMap, gpuCostMap,
-		cpuCoresMap, ramBytesMap, ramUserPctMap, gpuCountMap,
+		cpuCostMap, ramCostMap, gpuCostMap, gpuCountMap,
+		cpuCoresMap, ramBytesMap, ramUserPctMap,
 		ramSystemPctMap,
 		cpuBreakdownMap,
 		activeDataMap,

+ 26 - 14
pkg/costmodel/cluster_helpers.go

@@ -118,6 +118,7 @@ func buildRAMCostMap(
 
 func buildGPUCostMap(
 	resNodeGPUCost []*prom.QueryResult,
+	gpuCountMap map[NodeIdentifier]float64,
 	providerIDParser func(string) string,
 ) (
 	map[NodeIdentifier]float64,
@@ -156,7 +157,13 @@ func buildGPUCostMap(
 
 		clusterAndNameToType[keyNon] = nodeType
 
-		gpuCostMap[key] = gpuCost
+		// If gpu count is available use it to multiply gpu cost
+		if value, ok := gpuCountMap[key]; ok {
+			gpuCostMap[key] = gpuCost * value
+		} else {
+			gpuCostMap[key] = gpuCost
+		}
+
 	}
 
 	return gpuCostMap, clusterAndNameToType
@@ -164,11 +171,12 @@ func buildGPUCostMap(
 
 func buildGPUCountMap(
 	resNodeGPUCount []*prom.QueryResult,
+	providerIDParser func(string) string,
 ) (
-	map[nodeIdentifierNoProviderID]float64,
+	map[NodeIdentifier]float64,
 ) {
 
-	gpuCountMap := make(map[nodeIdentifierNoProviderID]float64)
+	gpuCountMap := make(map[NodeIdentifier]float64)
 
 	for _, result := range resNodeGPUCount {
 		cluster, err := result.GetString("cluster_id")
@@ -182,13 +190,16 @@ func buildGPUCountMap(
 			continue
 		}
 
-		gpuCost := result.Values[0].Value
+		gpuCount := result.Values[0].Value
+		providerID, _ := result.GetString("provider_id")
 
-		key := nodeIdentifierNoProviderID{
-			Cluster: cluster,
-			Name:    name,
+
+		key := NodeIdentifier{
+			Cluster:    cluster,
+			Name:       name,
+			ProviderID: providerIDParser(providerID),
 		}
-		gpuCountMap[key] = gpuCost
+		gpuCountMap[key] = gpuCount
 	}
 
 	return gpuCountMap
@@ -604,8 +615,8 @@ func checkForKeyAndInitIfMissing(
 // the node map, but this would introduce a roughly quadratic time
 // complexity.
 func buildNodeMap(
-	cpuCostMap, ramCostMap, gpuCostMap map[NodeIdentifier]float64,
-	cpuCoresMap, ramBytesMap, ramUserPctMap, gpuCountMap,
+	cpuCostMap, ramCostMap, gpuCostMap, gpuCountMap map[NodeIdentifier]float64,
+	cpuCoresMap, ramBytesMap, ramUserPctMap,
 	ramSystemPctMap map[nodeIdentifierNoProviderID]float64,
 	cpuBreakdownMap map[nodeIdentifierNoProviderID]*ClusterCostsBreakdown,
 	activeDataMap map[NodeIdentifier]activeData,
@@ -633,6 +644,11 @@ func buildNodeMap(
 		nodeMap[id].GPUCost = cost
 	}
 
+	for id, count := range gpuCountMap {
+		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
+		nodeMap[id].GPUCount = count
+	}
+
 	for id, preemptible := range preemptibleMap {
 		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
 		nodeMap[id].Preemptible = preemptible
@@ -665,10 +681,6 @@ func buildNodeMap(
 			}
 		}
 
-		if GPUs, ok := gpuCountMap[clusterAndNameID]; ok {
-			nodePtr.GPUCount = GPUs
-		}
-
 		if ramBytes, ok := ramBytesMap[clusterAndNameID]; ok {
 			nodePtr.RAMBytes = ramBytes
 		}

+ 46 - 36
pkg/costmodel/cluster_helpers_test.go

@@ -38,7 +38,7 @@ func TestMergeTypeMaps(t *testing.T) {
 			},
 		},
 		{
-			name: "map2 empty",
+			name: "map1 empty",
 			map1: map[nodeIdentifierNoProviderID]string{},
 			map2: map[nodeIdentifierNoProviderID]string{
 				nodeIdentifierNoProviderID{
@@ -118,11 +118,13 @@ func TestMergeTypeMaps(t *testing.T) {
 	}
 
 	for _, testCase := range cases {
-		result := mergeTypeMaps(testCase.map1, testCase.map2)
+		t.Run(testCase.name, func(t *testing.T) {
+			result := mergeTypeMaps(testCase.map1, testCase.map2)
 
-		if !reflect.DeepEqual(result, testCase.expected) {
-			t.Errorf("mergeTypeMaps case %s failed. Got %+v but expected %+v", testCase.name, result, testCase.expected)
-		}
+			if !reflect.DeepEqual(result, testCase.expected) {
+				t.Errorf("mergeTypeMaps case %s failed. Got %+v but expected %+v", testCase.name, result, testCase.expected)
+			}
+		})
 	}
 }
 
@@ -132,8 +134,8 @@ func TestBuildNodeMap(t *testing.T) {
 		cpuCostMap           map[NodeIdentifier]float64
 		ramCostMap           map[NodeIdentifier]float64
 		gpuCostMap           map[NodeIdentifier]float64
+		gpuCountMap          map[NodeIdentifier]float64
 		cpuCoresMap          map[nodeIdentifierNoProviderID]float64
-		gpuCountMap          map[nodeIdentifierNoProviderID]float64
 		ramBytesMap          map[nodeIdentifierNoProviderID]float64
 		ramUserPctMap        map[nodeIdentifierNoProviderID]float64
 		ramSystemPctMap      map[nodeIdentifierNoProviderID]float64
@@ -309,6 +311,23 @@ func TestBuildNodeMap(t *testing.T) {
 					ProviderID: "prov_node2_A",
 				}: 3.1,
 			},
+			gpuCountMap: map[NodeIdentifier]float64{
+				NodeIdentifier{
+					Cluster:    "cluster1",
+					Name:       "node1",
+					ProviderID: "prov_node1_A",
+				}: 1.0,
+				NodeIdentifier{
+					Cluster:    "cluster1",
+					Name:       "node1",
+					ProviderID: "prov_node1_B",
+				}: 1.0,
+				NodeIdentifier{
+					Cluster:    "cluster1",
+					Name:       "node2",
+					ProviderID: "prov_node2_A",
+				}: 2.0,
+			},
 			cpuCoresMap: map[nodeIdentifierNoProviderID]float64{
 				nodeIdentifierNoProviderID{
 					Cluster: "cluster1",
@@ -319,16 +338,6 @@ func TestBuildNodeMap(t *testing.T) {
 					Name:    "node2",
 				}: 5.0,
 			},
-			gpuCountMap: map[nodeIdentifierNoProviderID]float64{
-				nodeIdentifierNoProviderID{
-					Cluster: "cluster1",
-					Name:    "node1",
-				}: 1.0,
-				nodeIdentifierNoProviderID{
-					Cluster: "cluster1",
-					Name:    "node2",
-				}: 2.0,
-			},
 			ramBytesMap: map[nodeIdentifierNoProviderID]float64{
 				nodeIdentifierNoProviderID{
 					Cluster: "cluster1",
@@ -461,7 +470,7 @@ func TestBuildNodeMap(t *testing.T) {
 					RAMCost:    0.09,
 					GPUCost:    0.8,
 					CPUCores:   2.0,
-					GPUCount: 1.0,
+					GPUCount:   1.0,
 					RAMBytes:   2048.0,
 					RAMBreakdown: &ClusterCostsBreakdown{
 						User:   30.0,
@@ -663,25 +672,26 @@ func TestBuildNodeMap(t *testing.T) {
 	}
 
 	for _, testCase := range cases {
+		t.Run(testCase.name, func(t *testing.T) {
+			result := buildNodeMap(
+				testCase.cpuCostMap, testCase.ramCostMap, testCase.gpuCostMap, testCase.gpuCountMap,
+				testCase.cpuCoresMap, testCase.ramBytesMap, testCase.ramUserPctMap,
+				testCase.ramSystemPctMap,
+				testCase.cpuBreakdownMap,
+				testCase.activeDataMap,
+				testCase.preemptibleMap,
+				testCase.labelsMap,
+				testCase.clusterAndNameToType,
+			)
 
-		result := buildNodeMap(
-			testCase.cpuCostMap, testCase.ramCostMap, testCase.gpuCostMap,
-			testCase.cpuCoresMap, testCase.ramBytesMap, testCase.ramUserPctMap, testCase.gpuCountMap,
-			testCase.ramSystemPctMap,
-			testCase.cpuBreakdownMap,
-			testCase.activeDataMap,
-			testCase.preemptibleMap,
-			testCase.labelsMap,
-			testCase.clusterAndNameToType,
-		)
-
-		if !reflect.DeepEqual(result, testCase.expected) {
-			t.Errorf("buildNodeMap case %s failed. Got %+v but expected %+v", testCase.name, result, testCase.expected)
+			if !reflect.DeepEqual(result, testCase.expected) {
+				t.Errorf("buildNodeMap case %s failed. Got %+v but expected %+v", testCase.name, result, testCase.expected)
 
-			// Use spew because we have to follow pointers to figure out
-			// what isn't matching up
-			t.Logf("Got: %s", spew.Sdump(result))
-			t.Logf("Expected: %s", spew.Sdump(testCase.expected))
-		}
+				// Use spew because we have to follow pointers to figure out
+				// what isn't matching up
+				t.Logf("Got: %s", spew.Sdump(result))
+				t.Logf("Expected: %s", spew.Sdump(testCase.expected))
+			}
+		})
 	}
-}
+}