|
|
@@ -118,12 +118,12 @@ type Disk struct {
|
|
|
Breakdown *ClusterCostsBreakdown
|
|
|
}
|
|
|
|
|
|
-func ClusterDisks(client prometheus.Client, provider cloud.Provider, duration, offset time.Duration) (map[string]*Disk, error) {
|
|
|
- durationStr := fmt.Sprintf("%dm", int64(duration.Minutes()))
|
|
|
- offsetStr := fmt.Sprintf(" offset %dm", int64(offset.Minutes()))
|
|
|
- if offset < time.Minute {
|
|
|
- offsetStr = ""
|
|
|
- }
|
|
|
+func ClusterDisks(client prometheus.Client, provider cloud.Provider, start, end time.Time) (map[string]*Disk, error) {
|
|
|
+ // Query for the duration between start and end
|
|
|
+ durStr := timeutil.DurationString(end.Sub(start))
|
|
|
+
|
|
|
+ // Start from the time "end", querying backwards
|
|
|
+ t := end
|
|
|
|
|
|
// minsPerResolution determines accuracy and resource use for the following
|
|
|
// queries. Smaller values (higher resolution) result in better accuracy,
|
|
|
@@ -140,22 +140,22 @@ func ClusterDisks(client prometheus.Client, provider cloud.Provider, duration, o
|
|
|
costPerGBHr := 0.04 / 730.0
|
|
|
|
|
|
ctx := prom.NewNamedContext(client, prom.ClusterContextName)
|
|
|
- queryPVCost := fmt.Sprintf(`avg(avg_over_time(pv_hourly_cost[%s]%s)) by (%s, persistentvolume,provider_id)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryPVSize := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s]%s)) by (%s, persistentvolume)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryActiveMins := fmt.Sprintf(`count(pv_hourly_cost) by (%s, persistentvolume)[%s:%dm]%s`, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr)
|
|
|
-
|
|
|
- queryLocalStorageCost := fmt.Sprintf(`sum_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]%s) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr, hourlyToCumulative, costPerGBHr)
|
|
|
- queryLocalStorageUsedCost := fmt.Sprintf(`sum_over_time(sum(container_fs_usage_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]%s) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr, hourlyToCumulative, costPerGBHr)
|
|
|
- queryLocalStorageBytes := fmt.Sprintf(`avg_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]%s)`, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr)
|
|
|
- queryLocalActiveMins := fmt.Sprintf(`count(node_total_hourly_cost) by (%s, node)[%s:%dm]%s`, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr)
|
|
|
-
|
|
|
- resChPVCost := ctx.Query(queryPVCost)
|
|
|
- resChPVSize := ctx.Query(queryPVSize)
|
|
|
- resChActiveMins := ctx.Query(queryActiveMins)
|
|
|
- resChLocalStorageCost := ctx.Query(queryLocalStorageCost)
|
|
|
- resChLocalStorageUsedCost := ctx.Query(queryLocalStorageUsedCost)
|
|
|
- resChLocalStorageBytes := ctx.Query(queryLocalStorageBytes)
|
|
|
- resChLocalActiveMins := ctx.Query(queryLocalActiveMins)
|
|
|
+ queryPVCost := fmt.Sprintf(`avg(avg_over_time(pv_hourly_cost[%s])) by (%s, persistentvolume,provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryPVSize := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s])) by (%s, persistentvolume)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryActiveMins := fmt.Sprintf(`count(pv_hourly_cost) by (%s, persistentvolume)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+
|
|
|
+ queryLocalStorageCost := fmt.Sprintf(`sum_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
|
|
|
+ queryLocalStorageUsedCost := fmt.Sprintf(`sum_over_time(sum(container_fs_usage_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
|
|
|
+ queryLocalStorageBytes := fmt.Sprintf(`avg_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm])`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryLocalActiveMins := fmt.Sprintf(`count(node_total_hourly_cost) by (%s, node)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+
|
|
|
+ resChPVCost := ctx.QueryAtTime(queryPVCost, t)
|
|
|
+ resChPVSize := ctx.QueryAtTime(queryPVSize, t)
|
|
|
+ resChActiveMins := ctx.QueryAtTime(queryActiveMins, t)
|
|
|
+ resChLocalStorageCost := ctx.QueryAtTime(queryLocalStorageCost, t)
|
|
|
+ resChLocalStorageUsedCost := ctx.QueryAtTime(queryLocalStorageUsedCost, t)
|
|
|
+ resChLocalStorageBytes := ctx.QueryAtTime(queryLocalStorageBytes, t)
|
|
|
+ resChLocalActiveMins := ctx.QueryAtTime(queryLocalActiveMins, t)
|
|
|
|
|
|
resPVCost, _ := resChPVCost.Await()
|
|
|
resPVSize, _ := resChPVSize.Await()
|
|
|
@@ -274,7 +274,7 @@ func ClusterDisks(client prometheus.Client, provider cloud.Provider, duration, o
|
|
|
}
|
|
|
|
|
|
s := time.Unix(int64(result.Values[0].Timestamp), 0)
|
|
|
- e := time.Unix(int64(result.Values[len(result.Values)-1].Timestamp), 0).Add(resolution)
|
|
|
+ e := time.Unix(int64(result.Values[len(result.Values)-1].Timestamp), 0)
|
|
|
mins := e.Sub(s).Minutes()
|
|
|
|
|
|
// TODO niko/assets if mins >= threshold, interpolate for missing data?
|
|
|
@@ -369,12 +369,12 @@ func costTimesMinute(activeDataMap map[NodeIdentifier]activeData, costMap map[No
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset time.Duration) (map[NodeIdentifier]*Node, error) {
|
|
|
- durationStr := fmt.Sprintf("%dm", int64(duration.Minutes()))
|
|
|
- offsetStr := fmt.Sprintf(" offset %dm", int64(offset.Minutes()))
|
|
|
- if offset < time.Minute {
|
|
|
- offsetStr = ""
|
|
|
- }
|
|
|
+func ClusterNodes(cp cloud.Provider, client prometheus.Client, start, end time.Time) (map[NodeIdentifier]*Node, error) {
|
|
|
+ // Query for the duration between start and end
|
|
|
+ durStr := timeutil.DurationString(end.Sub(start))
|
|
|
+
|
|
|
+ // Start from the time "end", querying backwards
|
|
|
+ t := end
|
|
|
|
|
|
// minsPerResolution determines accuracy and resource use for the following
|
|
|
// queries. Smaller values (higher resolution) result in better accuracy,
|
|
|
@@ -385,34 +385,34 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
requiredCtx := prom.NewNamedContext(client, prom.ClusterContextName)
|
|
|
optionalCtx := prom.NewNamedContext(client, prom.ClusterOptionalContextName)
|
|
|
|
|
|
- queryNodeCPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (%s, node, instance_type, provider_id)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeCPUCores := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_cpu_cores[%s]%s)) by (%s, node)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeRAMHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (%s, node, instance_type, provider_id) / 1024 / 1024 / 1024`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeRAMBytes := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_memory_bytes[%s]%s)) by (%s, node)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeGPUCount := fmt.Sprintf(`avg(avg_over_time(node_gpu_count[%s]%s)) by (%s, node, provider_id)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeGPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (%s, node, instance_type, provider_id)`, durationStr, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm]%s)) by (kubernetes_node, %s, mode)`, durationStr, minsPerResolution, offsetStr, env.GetPromClusterLabel())
|
|
|
- queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, durationStr, minsPerResolution, offsetStr, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
- queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm]%s)) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, durationStr, minsPerResolution, offsetStr, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
- queryActiveMins := fmt.Sprintf(`avg(node_total_hourly_cost) by (node, %s, provider_id)[%s:%dm]%s`, env.GetPromClusterLabel(), durationStr, minsPerResolution, offsetStr)
|
|
|
- queryIsSpot := fmt.Sprintf(`avg_over_time(kubecost_node_is_spot[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
|
- queryLabels := fmt.Sprintf(`count_over_time(kube_node_labels[%s:%dm]%s)`, durationStr, minsPerResolution, offsetStr)
|
|
|
+ queryNodeCPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_cpu_hourly_cost[%s])) by (%s, node, instance_type, provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeCPUCores := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_cpu_cores[%s])) by (%s, node)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_ram_hourly_cost[%s])) by (%s, node, instance_type, provider_id) / 1024 / 1024 / 1024`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMBytes := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_memory_bytes[%s])) by (%s, node)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeGPUCount := fmt.Sprintf(`avg(avg_over_time(node_gpu_count[%s])) by (%s, node, provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeGPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_gpu_hourly_cost[%s])) by (%s, node, instance_type, provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm])) by (kubernetes_node, %s, mode)`, durStr, minsPerResolution, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, durStr, minsPerResolution, env.GetPromClusterLabel(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, durStr, minsPerResolution, env.GetPromClusterLabel(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
+ queryActiveMins := fmt.Sprintf(`avg(node_total_hourly_cost) by (node, %s, provider_id)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryIsSpot := fmt.Sprintf(`avg_over_time(kubecost_node_is_spot[%s:%dm])`, durStr, minsPerResolution)
|
|
|
+ queryLabels := fmt.Sprintf(`count_over_time(kube_node_labels[%s:%dm])`, durStr, minsPerResolution)
|
|
|
|
|
|
// Return errors if these fail
|
|
|
- resChNodeCPUHourlyCost := requiredCtx.Query(queryNodeCPUHourlyCost)
|
|
|
- resChNodeCPUCores := requiredCtx.Query(queryNodeCPUCores)
|
|
|
- resChNodeRAMHourlyCost := requiredCtx.Query(queryNodeRAMHourlyCost)
|
|
|
- resChNodeRAMBytes := requiredCtx.Query(queryNodeRAMBytes)
|
|
|
- resChNodeGPUCount := requiredCtx.Query(queryNodeGPUCount)
|
|
|
- resChNodeGPUHourlyCost := requiredCtx.Query(queryNodeGPUHourlyCost)
|
|
|
- resChActiveMins := requiredCtx.Query(queryActiveMins)
|
|
|
- resChIsSpot := requiredCtx.Query(queryIsSpot)
|
|
|
+ resChNodeCPUHourlyCost := requiredCtx.QueryAtTime(queryNodeCPUHourlyCost, t)
|
|
|
+ resChNodeCPUCores := requiredCtx.QueryAtTime(queryNodeCPUCores, t)
|
|
|
+ resChNodeRAMHourlyCost := requiredCtx.QueryAtTime(queryNodeRAMHourlyCost, t)
|
|
|
+ resChNodeRAMBytes := requiredCtx.QueryAtTime(queryNodeRAMBytes, t)
|
|
|
+ resChNodeGPUCount := requiredCtx.QueryAtTime(queryNodeGPUCount, t)
|
|
|
+ resChNodeGPUHourlyCost := requiredCtx.QueryAtTime(queryNodeGPUHourlyCost, t)
|
|
|
+ resChActiveMins := requiredCtx.QueryAtTime(queryActiveMins, t)
|
|
|
+ resChIsSpot := requiredCtx.QueryAtTime(queryIsSpot, t)
|
|
|
|
|
|
// Do not return errors if these fail, but log warnings
|
|
|
- resChNodeCPUModeTotal := optionalCtx.Query(queryNodeCPUModeTotal)
|
|
|
- resChNodeRAMSystemPct := optionalCtx.Query(queryNodeRAMSystemPct)
|
|
|
- resChNodeRAMUserPct := optionalCtx.Query(queryNodeRAMUserPct)
|
|
|
- resChLabels := optionalCtx.Query(queryLabels)
|
|
|
+ resChNodeCPUModeTotal := optionalCtx.QueryAtTime(queryNodeCPUModeTotal, t)
|
|
|
+ resChNodeRAMSystemPct := optionalCtx.QueryAtTime(queryNodeRAMSystemPct, t)
|
|
|
+ resChNodeRAMUserPct := optionalCtx.QueryAtTime(queryNodeRAMUserPct, t)
|
|
|
+ resChLabels := optionalCtx.QueryAtTime(queryLabels, t)
|
|
|
|
|
|
resNodeCPUHourlyCost, _ := resChNodeCPUHourlyCost.Await()
|
|
|
resNodeCPUCores, _ := resChNodeCPUCores.Await()
|
|
|
@@ -475,6 +475,7 @@ func ClusterNodes(cp cloud.Provider, client prometheus.Client, duration, offset
|
|
|
preemptibleMap,
|
|
|
labelsMap,
|
|
|
clusterAndNameToType,
|
|
|
+ resolution,
|
|
|
)
|
|
|
|
|
|
c, err := cp.GetConfig()
|
|
|
@@ -513,6 +514,7 @@ type LoadBalancer struct {
|
|
|
Minutes float64
|
|
|
}
|
|
|
|
|
|
+// TODO: migrate to (start, end time.Time) + QueryAtTime, matching ClusterDisks and ClusterNodes above
|
|
|
func ClusterLoadBalancers(client prometheus.Client, duration, offset time.Duration) (map[string]*LoadBalancer, error) {
|
|
|
durationStr := fmt.Sprintf("%dm", int64(duration.Minutes()))
|
|
|
offsetStr := fmt.Sprintf(" offset %dm", int64(offset.Minutes()))
|
|
|
@@ -1105,7 +1107,7 @@ func pvCosts(diskMap map[string]*Disk, resolution time.Duration, resActiveMins,
|
|
|
}
|
|
|
}
|
|
|
s := time.Unix(int64(result.Values[0].Timestamp), 0)
|
|
|
- e := time.Unix(int64(result.Values[len(result.Values)-1].Timestamp), 0).Add(resolution)
|
|
|
+ e := time.Unix(int64(result.Values[len(result.Values)-1].Timestamp), 0)
|
|
|
mins := e.Sub(s).Minutes()
|
|
|
|
|
|
// TODO niko/assets if mins >= threshold, interpolate for missing data?
|