|
@@ -425,6 +425,32 @@ type nodeIdentifierNoProviderID struct {
|
|
|
Name string
|
|
Name string
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+func costTimesMinuteAndCount(activeDataMap map[NodeIdentifier]activeData, costMap map[NodeIdentifier]float64, resourceCountMap map[nodeIdentifierNoProviderID]float64) {
|
|
|
|
|
+ for k, v := range activeDataMap {
|
|
|
|
|
+ keyNon := nodeIdentifierNoProviderID{
|
|
|
|
|
+ Cluster: k.Cluster,
|
|
|
|
|
+ Name: k.Name,
|
|
|
|
|
+ }
|
|
|
|
|
+ if cost, ok := costMap[k]; ok {
|
|
|
|
|
+ minutes := v.minutes
|
|
|
|
|
+ count := 1.0
|
|
|
|
|
+ if c, ok := resourceCountMap[keyNon]; ok {
|
|
|
|
|
+ count = c
|
|
|
|
|
+ }
|
|
|
|
|
+ costMap[k] = cost * (minutes / 60) * count
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func costTimesMinute(activeDataMap map[NodeIdentifier]activeData, costMap map[NodeIdentifier]float64) {
|
|
|
|
|
+ for k, v := range activeDataMap {
|
|
|
|
|
+ if cost, ok := costMap[k]; ok {
|
|
|
|
|
+ minutes := v.minutes
|
|
|
|
|
+ costMap[k] = cost * (minutes / 60)
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset time.Duration) (map[NodeIdentifier]*Node, error) {
|
|
func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset time.Duration) (map[NodeIdentifier]*Node, error) {
|
|
|
durationStr := fmt.Sprintf("%dm", int64(duration.Minutes()))
|
|
durationStr := fmt.Sprintf("%dm", int64(duration.Minutes()))
|
|
|
offsetStr := fmt.Sprintf(" offset %dm", int64(offset.Minutes()))
|
|
offsetStr := fmt.Sprintf(" offset %dm", int64(offset.Minutes()))
|
|
@@ -438,20 +464,15 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
minsPerResolution := 1
|
|
minsPerResolution := 1
|
|
|
resolution := time.Duration(minsPerResolution) * time.Minute
|
|
resolution := time.Duration(minsPerResolution) * time.Minute
|
|
|
|
|
|
|
|
- // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
|
|
|
|
|
- // value, converts it to a cumulative value; i.e.
|
|
|
|
|
- // [$/hr] * [min/res]*[hr/min] = [$/res]
|
|
|
|
|
- hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
|
|
|
|
|
-
|
|
|
|
|
requiredCtx := prom.NewContext(client)
|
|
requiredCtx := prom.NewContext(client)
|
|
|
optionalCtx := prom.NewContext(client)
|
|
optionalCtx := prom.NewContext(client)
|
|
|
|
|
|
|
|
- queryNodeCPUCost := fmt.Sprintf(`sum_over_time((avg(kube_node_status_capacity_cpu_cores) by (cluster_id, node) * on(node, cluster_id) group_right avg(node_cpu_hourly_cost) by (cluster_id, node, instance_type, provider_id))[%s:%dm]%s) * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
|
|
|
|
|
- queryNodeCPUCores := fmt.Sprintf(`avg_over_time(avg(kube_node_status_capacity_cpu_cores) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
|
|
|
- queryNodeRAMCost := fmt.Sprintf(`sum_over_time((avg(kube_node_status_capacity_memory_bytes) by (cluster_id, node) * on(cluster_id, node) group_right avg(node_ram_hourly_cost) by (cluster_id, node, instance_type, provider_id))[%s:%dm]%s) / 1024 / 1024 / 1024 * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
|
|
|
|
|
- queryNodeRAMBytes := fmt.Sprintf(`avg_over_time(avg(kube_node_status_capacity_memory_bytes) by (cluster_id, node)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
|
|
|
- queryNodeGPUCount := fmt.Sprintf(`avg_over_time(avg(node_gpu_count) by (cluster_id, node, provider_id)[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
|
|
|
- queryNodeGPUHourlySum := fmt.Sprintf(`sum_over_time(avg(node_gpu_hourly_cost) by (cluster_id, node, instance_type, provider_id)[%s:%dm]%s) * %f`, durationStr, minsPerResolution, offsetStr, hourlyToCumulative)
|
|
|
|
|
|
|
+ queryNodeCPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (cluster_id, node, instance_type, provider_id)`, durationStr, offsetStr)
|
|
|
|
|
+ queryNodeCPUCores := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_cpu_cores[%s]%s)) by (cluster_id, node)`, durationStr, offsetStr)
|
|
|
|
|
+ queryNodeRAMHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (cluster_id, node, instance_type, provider_id) / 1024 / 1024 / 1024`, durationStr, offsetStr)
|
|
|
|
|
+ queryNodeRAMBytes := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_memory_bytes[%s]%s)) by (cluster_id, node)`, durationStr, offsetStr)
|
|
|
|
|
+ queryNodeGPUCount := fmt.Sprintf(`avg(avg_over_time(node_gpu_count[%s]%s)) by (cluster_id, node, provider_id)`, durationStr, offsetStr)
|
|
|
|
|
+ queryNodeGPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (cluster_id, node, instance_type, provider_id)`, durationStr, offsetStr)
|
|
|
queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, cluster_id, mode)`, durationStr, minsPerResolution, offsetStr)
|
|
queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, cluster_id, mode)`, durationStr, minsPerResolution, offsetStr)
|
|
|
queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
|
|
queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
|
|
|
queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
|
|
queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm]%s)) by (instance, cluster_id) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, cluster_id), "instance", "$1", "node", "(.*)")) by (instance, cluster_id)`, durationStr, minsPerResolution, offsetStr, durationStr, minsPerResolution, offsetStr)
|
|
@@ -460,12 +481,12 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
queryLabels := fmt.Sprintf(`count_over_time(kube_node_labels[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
queryLabels := fmt.Sprintf(`count_over_time(kube_node_labels[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
|
|
|
|
|
|
// Return errors if these fail
|
|
// Return errors if these fail
|
|
|
- resChNodeCPUCost := requiredCtx.Query(queryNodeCPUCost)
|
|
|
|
|
|
|
+ resChNodeCPUHourlyCost := requiredCtx.Query(queryNodeCPUHourlyCost)
|
|
|
resChNodeCPUCores := requiredCtx.Query(queryNodeCPUCores)
|
|
resChNodeCPUCores := requiredCtx.Query(queryNodeCPUCores)
|
|
|
- resChNodeRAMCost := requiredCtx.Query(queryNodeRAMCost)
|
|
|
|
|
|
|
+ resChNodeRAMHourlyCost := requiredCtx.Query(queryNodeRAMHourlyCost)
|
|
|
resChNodeRAMBytes := requiredCtx.Query(queryNodeRAMBytes)
|
|
resChNodeRAMBytes := requiredCtx.Query(queryNodeRAMBytes)
|
|
|
resChNodeGPUCount := requiredCtx.Query(queryNodeGPUCount)
|
|
resChNodeGPUCount := requiredCtx.Query(queryNodeGPUCount)
|
|
|
- resChNodeGPUHourlySum := requiredCtx.Query(queryNodeGPUHourlySum)
|
|
|
|
|
|
|
+ resChNodeGPUHourlyCost := requiredCtx.Query(queryNodeGPUHourlyCost)
|
|
|
resChActiveMins := requiredCtx.Query(queryActiveMins)
|
|
resChActiveMins := requiredCtx.Query(queryActiveMins)
|
|
|
resChIsSpot := requiredCtx.Query(queryIsSpot)
|
|
resChIsSpot := requiredCtx.Query(queryIsSpot)
|
|
|
|
|
|
|
@@ -475,11 +496,11 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
resChNodeRAMUserPct := optionalCtx.Query(queryNodeRAMUserPct)
|
|
resChNodeRAMUserPct := optionalCtx.Query(queryNodeRAMUserPct)
|
|
|
resChLabels := optionalCtx.Query(queryLabels)
|
|
resChLabels := optionalCtx.Query(queryLabels)
|
|
|
|
|
|
|
|
- resNodeCPUCost, _ := resChNodeCPUCost.Await()
|
|
|
|
|
|
|
+ resNodeCPUHourlyCost, _ := resChNodeCPUHourlyCost.Await()
|
|
|
resNodeCPUCores, _ := resChNodeCPUCores.Await()
|
|
resNodeCPUCores, _ := resChNodeCPUCores.Await()
|
|
|
resNodeGPUCount, _ := resChNodeGPUCount.Await()
|
|
resNodeGPUCount, _ := resChNodeGPUCount.Await()
|
|
|
- resNodeGPUHourlySum, _ := resChNodeGPUHourlySum.Await()
|
|
|
|
|
- resNodeRAMCost, _ := resChNodeRAMCost.Await()
|
|
|
|
|
|
|
+ resNodeGPUHourlyCost, _ := resChNodeGPUHourlyCost.Await()
|
|
|
|
|
+ resNodeRAMHourlyCost, _ := resChNodeRAMHourlyCost.Await()
|
|
|
resNodeRAMBytes, _ := resChNodeRAMBytes.Await()
|
|
resNodeRAMBytes, _ := resChNodeRAMBytes.Await()
|
|
|
resIsSpot, _ := resChIsSpot.Await()
|
|
resIsSpot, _ := resChIsSpot.Await()
|
|
|
resNodeCPUModeTotal, _ := resChNodeCPUModeTotal.Await()
|
|
resNodeCPUModeTotal, _ := resChNodeCPUModeTotal.Await()
|
|
@@ -501,11 +522,13 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
return nil, requiredCtx.ErrorCollection()
|
|
return nil, requiredCtx.ErrorCollection()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ activeDataMap := buildActiveDataMap(resActiveMins, resolution, cp.ParseID)
|
|
|
|
|
+
|
|
|
gpuCountMap := buildGPUCountMap(resNodeGPUCount, cp.ParseID)
|
|
gpuCountMap := buildGPUCountMap(resNodeGPUCount, cp.ParseID)
|
|
|
|
|
|
|
|
- cpuCostMap, clusterAndNameToType1 := buildCPUCostMap(resNodeCPUCost, cp.ParseID)
|
|
|
|
|
- ramCostMap, clusterAndNameToType2 := buildRAMCostMap(resNodeRAMCost, cp.ParseID)
|
|
|
|
|
- gpuCostMap, clusterAndNameToType3 := buildGPUCostMap(resNodeGPUHourlySum, gpuCountMap, cp.ParseID)
|
|
|
|
|
|
|
+ cpuCostMap, clusterAndNameToType1 := buildCPUCostMap(resNodeCPUHourlyCost, cp.ParseID)
|
|
|
|
|
+ ramCostMap, clusterAndNameToType2 := buildRAMCostMap(resNodeRAMHourlyCost, cp.ParseID)
|
|
|
|
|
+ gpuCostMap, clusterAndNameToType3 := buildGPUCostMap(resNodeGPUHourlyCost, gpuCountMap, cp.ParseID)
|
|
|
|
|
|
|
|
clusterAndNameToTypeIntermediate := mergeTypeMaps(clusterAndNameToType1, clusterAndNameToType2)
|
|
clusterAndNameToTypeIntermediate := mergeTypeMaps(clusterAndNameToType1, clusterAndNameToType2)
|
|
|
clusterAndNameToType := mergeTypeMaps(clusterAndNameToTypeIntermediate, clusterAndNameToType3)
|
|
clusterAndNameToType := mergeTypeMaps(clusterAndNameToTypeIntermediate, clusterAndNameToType3)
|
|
@@ -518,10 +541,13 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
ramSystemPctMap := buildRAMSystemPctMap(resNodeRAMSystemPct)
|
|
ramSystemPctMap := buildRAMSystemPctMap(resNodeRAMSystemPct)
|
|
|
|
|
|
|
|
cpuBreakdownMap := buildCPUBreakdownMap(resNodeCPUModeTotal)
|
|
cpuBreakdownMap := buildCPUBreakdownMap(resNodeCPUModeTotal)
|
|
|
- activeDataMap := buildActiveDataMap(resActiveMins, resolution, cp.ParseID)
|
|
|
|
|
preemptibleMap := buildPreemptibleMap(resIsSpot, cp.ParseID)
|
|
preemptibleMap := buildPreemptibleMap(resIsSpot, cp.ParseID)
|
|
|
labelsMap := buildLabelsMap(resLabels)
|
|
labelsMap := buildLabelsMap(resLabels)
|
|
|
|
|
|
|
|
|
|
+ costTimesMinuteAndCount(activeDataMap, cpuCostMap, cpuCoresMap)
|
|
|
|
|
+ costTimesMinuteAndCount(activeDataMap, ramCostMap, ramBytesMap)
|
|
|
|
|
+ costTimesMinute(activeDataMap, gpuCostMap) // there's no need to do a weird "nodeIdentifierNoProviderID" type match since gpuCounts have a providerID
|
|
|
|
|
+
|
|
|
nodeMap := buildNodeMap(
|
|
nodeMap := buildNodeMap(
|
|
|
cpuCostMap, ramCostMap, gpuCostMap, gpuCountMap,
|
|
cpuCostMap, ramCostMap, gpuCostMap, gpuCountMap,
|
|
|
cpuCoresMap, ramBytesMap, ramUserPctMap,
|
|
cpuCoresMap, ramBytesMap, ramUserPctMap,
|