|
|
@@ -19,26 +19,26 @@ import (
|
|
|
|
|
|
const ( // PromQL templates formatted in ClusterCostsOverTime: every {%s} selector slot receives env.GetPromClusterFilter() and every %s inside a by (...) clause receives env.GetPromClusterLabel().
|
|
|
queryClusterCores = `sum(
|
|
|
- avg(avg_over_time(kube_node_status_capacity_cpu_cores[%s] %s)) by (node, %s) * avg(avg_over_time(node_cpu_hourly_cost[%s] %s)) by (node, %s) * 730 +
|
|
|
- avg(avg_over_time(node_gpu_hourly_cost[%s] %s)) by (node, %s) * 730
|
|
|
+ avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s] %s)) by (node, %s) * avg(avg_over_time(node_cpu_hourly_cost{%s}[%s] %s)) by (node, %s) * 730 +
|
|
|
+ avg(avg_over_time(node_gpu_hourly_cost{%s}[%s] %s)) by (node, %s) * 730
|
|
|
) by (%s)` // 13 args: (cluster filter, window, offset, cluster label) for each of the three series, then the cluster label for the outer sum.
|
|
|
|
|
|
queryClusterRAM = `sum(
|
|
|
- avg(avg_over_time(kube_node_status_capacity_memory_bytes[%s] %s)) by (node, %s) / 1024 / 1024 / 1024 * avg(avg_over_time(node_ram_hourly_cost[%s] %s)) by (node, %s) * 730
|
|
|
+ avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s] %s)) by (node, %s) / 1024 / 1024 / 1024 * avg(avg_over_time(node_ram_hourly_cost{%s}[%s] %s)) by (node, %s) * 730
|
|
|
) by (%s)` // 9 args: (cluster filter, window, offset, cluster label) for each of the two series, then the cluster label for the outer sum.
|
|
|
|
|
|
queryStorage = `sum(
|
|
|
- avg(avg_over_time(pv_hourly_cost[%s] %s)) by (persistentvolume, %s) * 730
|
|
|
- * avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s] %s)) by (persistentvolume, %s) / 1024 / 1024 / 1024
|
|
|
+ avg(avg_over_time(pv_hourly_cost{%s}[%s] %s)) by (persistentvolume, %s) * 730
|
|
|
+ * avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s] %s)) by (persistentvolume, %s) / 1024 / 1024 / 1024
|
|
|
) by (%s) %s` // 10 args: (cluster filter, window, offset, cluster label) per series, the cluster label for the outer sum, then localStorageQuery as the trailing %s.
|
|
|
|
|
|
- queryTotal = `sum(avg(node_total_hourly_cost) by (node, %s)) * 730 +
|
|
|
+ queryTotal = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 +
|
|
|
sum(
|
|
|
- avg(avg_over_time(pv_hourly_cost[1h])) by (persistentvolume, %s) * 730
|
|
|
- * avg(avg_over_time(kube_persistentvolume_capacity_bytes[1h])) by (persistentvolume, %s) / 1024 / 1024 / 1024
|
|
|
+ avg(avg_over_time(pv_hourly_cost{%s}[1h])) by (persistentvolume, %s) * 730
|
|
|
+ * avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[1h])) by (persistentvolume, %s) / 1024 / 1024 / 1024
|
|
|
) by (%s) %s`
|
|
|
|
|
|
- queryNodes = `sum(avg(node_total_hourly_cost) by (node, %s)) * 730 %s`
|
|
|
+ queryNodes = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 %s`
|
|
|
)
|
|
|
|
|
|
const maxLocalDiskSize = 200 // AWS limits root disks to 100 Gi, and occasional metric errors in filesystem size should not contribute to large costs. NOTE(review): the cap is 200 while the cited limit is 100 Gi; presumably intentional headroom and presumably expressed in GiB; confirm against where it is applied.
|
|
|
@@ -171,19 +171,19 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
|
|
|
costPerGBHr := 0.04 / 730.0
|
|
|
|
|
|
ctx := prom.NewNamedContext(client, prom.ClusterContextName)
|
|
|
- queryPVCost := fmt.Sprintf(`avg(avg_over_time(pv_hourly_cost[%s])) by (%s, persistentvolume,provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryPVSize := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s])) by (%s, persistentvolume)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryActiveMins := fmt.Sprintf(`avg(kube_persistentvolume_capacity_bytes) by (%s, persistentvolume)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
- queryPVStorageClass := fmt.Sprintf(`avg(avg_over_time(kubecost_pv_info[%s])) by (%s, persistentvolume, storageclass)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryPVUsedAvg := fmt.Sprintf(`avg(avg_over_time(kubelet_volume_stats_used_bytes[%s])) by (%s, persistentvolumeclaim, namespace)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryPVUsedMax := fmt.Sprintf(`max(max_over_time(kubelet_volume_stats_used_bytes[%s])) by (%s, persistentvolumeclaim, namespace)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryPVCInfo := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolumeclaim_info[%s])) by (%s, volumename, persistentvolumeclaim, namespace)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryLocalStorageCost := fmt.Sprintf(`sum_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
|
|
|
- queryLocalStorageUsedCost := fmt.Sprintf(`sum_over_time(sum(container_fs_usage_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
|
|
|
- queryLocalStorageUsedAvg := fmt.Sprintf(`avg(avg_over_time(container_fs_usage_bytes{device!="tmpfs", id="/"}[%s])) by (instance, %s)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryLocalStorageUsedMax := fmt.Sprintf(`max(max_over_time(container_fs_usage_bytes{device!="tmpfs", id="/"}[%s])) by (instance, %s)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryLocalStorageBytes := fmt.Sprintf(`avg_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/"}) by (instance, %s)[%s:%dm])`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
- queryLocalActiveMins := fmt.Sprintf(`count(node_total_hourly_cost) by (%s, node)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryPVCost := fmt.Sprintf(`avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (%s, persistentvolume,provider_id)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryPVSize := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (%s, persistentvolume)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryActiveMins := fmt.Sprintf(`avg(kube_persistentvolume_capacity_bytes{%s}) by (%s, persistentvolume)[%s:%dm]`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryPVStorageClass := fmt.Sprintf(`avg(avg_over_time(kubecost_pv_info{%s}[%s])) by (%s, persistentvolume, storageclass)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryPVUsedAvg := fmt.Sprintf(`avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryPVUsedMax := fmt.Sprintf(`max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryPVCInfo := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolumeclaim_info{%s}[%s])) by (%s, volumename, persistentvolumeclaim, namespace)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryLocalStorageCost := fmt.Sprintf(`sum_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/", %s}) by (instance, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
|
|
|
+ queryLocalStorageUsedCost := fmt.Sprintf(`sum_over_time(sum(container_fs_usage_bytes{device!="tmpfs", id="/", %s}) by (instance, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
|
|
|
+ queryLocalStorageUsedAvg := fmt.Sprintf(`avg(avg_over_time(container_fs_usage_bytes{device!="tmpfs", id="/", %s}[%s])) by (instance, %s)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryLocalStorageUsedMax := fmt.Sprintf(`max(max_over_time(container_fs_usage_bytes{device!="tmpfs", id="/", %s}[%s])) by (instance, %s)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryLocalStorageBytes := fmt.Sprintf(`avg_over_time(sum(container_fs_limit_bytes{device!="tmpfs", id="/", %s}) by (instance, %s)[%s:%dm])`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryLocalActiveMins := fmt.Sprintf(`count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
|
|
|
resChPVCost := ctx.QueryAtTime(queryPVCost, t)
|
|
|
resChPVSize := ctx.QueryAtTime(queryPVSize, t)
|
|
|
@@ -472,6 +472,10 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
|
|
|
return diskMap, nil
|
|
|
}
|
|
|
|
|
|
+type NodeOverhead struct { // NodeOverhead holds per-node CPU and RAM overhead fractions; it is attached to Node through the Overhead field and produced by buildOverheadMap from the capacity and allocatable maps.
|
|
|
+ CpuOverheadFraction float64 // CPU overhead as a fraction; presumably (capacity - allocatable) / capacity cores; TODO confirm in buildOverheadMap.
|
|
|
+ RamOverheadFraction float64 // RAM overhead as a fraction; presumably (capacity - allocatable) / capacity bytes; TODO confirm in buildOverheadMap.
|
|
|
+}
|
|
|
type Node struct {
|
|
|
Cluster string
|
|
|
Name string
|
|
|
@@ -494,6 +498,7 @@ type Node struct {
|
|
|
CostPerCPUHr float64
|
|
|
CostPerRAMGiBHr float64
|
|
|
CostPerGPUHr float64
|
|
|
+ Overhead *NodeOverhead
|
|
|
}
|
|
|
|
|
|
// GKE lies about the number of cores e2 nodes have. This table
|
|
|
@@ -566,24 +571,28 @@ func ClusterNodes(cp models.Provider, client prometheus.Client, start, end time.
|
|
|
requiredCtx := prom.NewNamedContext(client, prom.ClusterContextName)
|
|
|
optionalCtx := prom.NewNamedContext(client, prom.ClusterOptionalContextName)
|
|
|
|
|
|
- queryNodeCPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_cpu_hourly_cost[%s])) by (%s, node, instance_type, provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryNodeCPUCores := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_cpu_cores[%s])) by (%s, node)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryNodeRAMHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_ram_hourly_cost[%s])) by (%s, node, instance_type, provider_id) / 1024 / 1024 / 1024`, durStr, env.GetPromClusterLabel())
|
|
|
- queryNodeRAMBytes := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_memory_bytes[%s])) by (%s, node)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryNodeGPUCount := fmt.Sprintf(`avg(avg_over_time(node_gpu_count[%s])) by (%s, node, provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryNodeGPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_gpu_hourly_cost[%s])) by (%s, node, instance_type, provider_id)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total[%s:%dm])) by (kubernetes_node, %s, mode)`, durStr, minsPerResolution, env.GetPromClusterLabel())
|
|
|
- queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system"}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, durStr, minsPerResolution, env.GetPromClusterLabel(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
- queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system"}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, durStr, minsPerResolution, env.GetPromClusterLabel(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
- queryActiveMins := fmt.Sprintf(`avg(node_total_hourly_cost) by (node, %s, provider_id)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
- queryIsSpot := fmt.Sprintf(`avg_over_time(kubecost_node_is_spot[%s:%dm])`, durStr, minsPerResolution)
|
|
|
- queryLabels := fmt.Sprintf(`count_over_time(kube_node_labels[%s:%dm])`, durStr, minsPerResolution)
|
|
|
+ queryNodeCPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (%s, node, instance_type, provider_id)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeCPUCoresCapacity := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (%s, node)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeCPUCoresAllocatable := fmt.Sprintf(`avg(avg_over_time(kube_node_status_allocatable_cpu_cores{%s}[%s])) by (%s, node)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (%s, node, instance_type, provider_id) / 1024 / 1024 / 1024`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMBytesCapacity := fmt.Sprintf(`avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (%s, node)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMBytesAllocatable := fmt.Sprintf(`avg(avg_over_time(kube_node_status_allocatable_memory_bytes{%s}[%s])) by (%s, node)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeGPUCount := fmt.Sprintf(`avg(avg_over_time(node_gpu_count{%s}[%s])) by (%s, node, provider_id)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeGPUHourlyCost := fmt.Sprintf(`avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (%s, node, instance_type, provider_id)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryNodeCPUModeTotal := fmt.Sprintf(`sum(rate(node_cpu_seconds_total{%s}[%s:%dm])) by (kubernetes_node, %s, mode)`, env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMSystemPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
+ queryNodeRAMUserPct := fmt.Sprintf(`sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`, env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
+ queryActiveMins := fmt.Sprintf(`avg(node_total_hourly_cost{%s}) by (node, %s, provider_id)[%s:%dm]`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryIsSpot := fmt.Sprintf(`avg_over_time(kubecost_node_is_spot{%s}[%s:%dm])`, env.GetPromClusterFilter(), durStr, minsPerResolution)
|
|
|
+ queryLabels := fmt.Sprintf(`count_over_time(kube_node_labels{%s}[%s:%dm])`, env.GetPromClusterFilter(), durStr, minsPerResolution)
|
|
|
|
|
|
// Return errors if these fail
|
|
|
resChNodeCPUHourlyCost := requiredCtx.QueryAtTime(queryNodeCPUHourlyCost, t)
|
|
|
- resChNodeCPUCores := requiredCtx.QueryAtTime(queryNodeCPUCores, t)
|
|
|
+ resChNodeCPUCoresCapacity := requiredCtx.QueryAtTime(queryNodeCPUCoresCapacity, t)
|
|
|
+ resChNodeCPUCoresAllocatable := requiredCtx.QueryAtTime(queryNodeCPUCoresAllocatable, t)
|
|
|
resChNodeRAMHourlyCost := requiredCtx.QueryAtTime(queryNodeRAMHourlyCost, t)
|
|
|
- resChNodeRAMBytes := requiredCtx.QueryAtTime(queryNodeRAMBytes, t)
|
|
|
+ resChNodeRAMBytesCapacity := requiredCtx.QueryAtTime(queryNodeRAMBytesCapacity, t)
|
|
|
+ resChNodeRAMBytesAllocatable := requiredCtx.QueryAtTime(queryNodeRAMBytesAllocatable, t)
|
|
|
resChNodeGPUCount := requiredCtx.QueryAtTime(queryNodeGPUCount, t)
|
|
|
resChNodeGPUHourlyCost := requiredCtx.QueryAtTime(queryNodeGPUHourlyCost, t)
|
|
|
resChActiveMins := requiredCtx.QueryAtTime(queryActiveMins, t)
|
|
|
@@ -596,11 +605,13 @@ func ClusterNodes(cp models.Provider, client prometheus.Client, start, end time.
|
|
|
resChLabels := optionalCtx.QueryAtTime(queryLabels, t)
|
|
|
|
|
|
resNodeCPUHourlyCost, _ := resChNodeCPUHourlyCost.Await()
|
|
|
- resNodeCPUCores, _ := resChNodeCPUCores.Await()
|
|
|
+ resNodeCPUCoresCapacity, _ := resChNodeCPUCoresCapacity.Await()
|
|
|
+ resNodeCPUCoresAllocatable, _ := resChNodeCPUCoresAllocatable.Await()
|
|
|
resNodeGPUCount, _ := resChNodeGPUCount.Await()
|
|
|
resNodeGPUHourlyCost, _ := resChNodeGPUHourlyCost.Await()
|
|
|
resNodeRAMHourlyCost, _ := resChNodeRAMHourlyCost.Await()
|
|
|
- resNodeRAMBytes, _ := resChNodeRAMBytes.Await()
|
|
|
+ resNodeRAMBytesCapacity, _ := resChNodeRAMBytesCapacity.Await()
|
|
|
+ resNodeRAMBytesAllocatable, _ := resChNodeRAMBytesAllocatable.Await()
|
|
|
resIsSpot, _ := resChIsSpot.Await()
|
|
|
resNodeCPUModeTotal, _ := resChNodeCPUModeTotal.Await()
|
|
|
resNodeRAMSystemPct, _ := resChNodeRAMSystemPct.Await()
|
|
|
@@ -633,8 +644,12 @@ func ClusterNodes(cp models.Provider, client prometheus.Client, start, end time.
|
|
|
clusterAndNameToTypeIntermediate := mergeTypeMaps(clusterAndNameToType1, clusterAndNameToType2)
|
|
|
clusterAndNameToType := mergeTypeMaps(clusterAndNameToTypeIntermediate, clusterAndNameToType3)
|
|
|
|
|
|
- cpuCoresMap := buildCPUCoresMap(resNodeCPUCores)
|
|
|
- ramBytesMap := buildRAMBytesMap(resNodeRAMBytes)
|
|
|
+ cpuCoresCapacityMap := buildCPUCoresMap(resNodeCPUCoresCapacity)
|
|
|
+ ramBytesCapacityMap := buildRAMBytesMap(resNodeRAMBytesCapacity)
|
|
|
+
|
|
|
+ cpuCoresAllocatableMap := buildCPUCoresMap(resNodeCPUCoresAllocatable)
|
|
|
+ ramBytesAllocatableMap := buildRAMBytesMap(resNodeRAMBytesAllocatable)
|
|
|
+ overheadMap := buildOverheadMap(ramBytesCapacityMap, ramBytesAllocatableMap, cpuCoresCapacityMap, cpuCoresAllocatableMap)
|
|
|
|
|
|
ramUserPctMap := buildRAMUserPctMap(resNodeRAMUserPct)
|
|
|
ramSystemPctMap := buildRAMSystemPctMap(resNodeRAMSystemPct)
|
|
|
@@ -643,13 +658,13 @@ func ClusterNodes(cp models.Provider, client prometheus.Client, start, end time.
|
|
|
|
|
|
labelsMap := buildLabelsMap(resLabels)
|
|
|
|
|
|
- costTimesMinuteAndCount(activeDataMap, cpuCostMap, cpuCoresMap)
|
|
|
- costTimesMinuteAndCount(activeDataMap, ramCostMap, ramBytesMap)
|
|
|
+ costTimesMinuteAndCount(activeDataMap, cpuCostMap, cpuCoresCapacityMap)
|
|
|
+ costTimesMinuteAndCount(activeDataMap, ramCostMap, ramBytesCapacityMap)
|
|
|
costTimesMinute(activeDataMap, gpuCostMap) // there's no need to do a weird "nodeIdentifierNoProviderID" type match since gpuCounts have a providerID
|
|
|
|
|
|
nodeMap := buildNodeMap(
|
|
|
cpuCostMap, ramCostMap, gpuCostMap, gpuCountMap,
|
|
|
- cpuCoresMap, ramBytesMap, ramUserPctMap,
|
|
|
+ cpuCoresCapacityMap, ramBytesCapacityMap, ramUserPctMap,
|
|
|
ramSystemPctMap,
|
|
|
cpuBreakdownMap,
|
|
|
activeDataMap,
|
|
|
@@ -657,6 +672,7 @@ func ClusterNodes(cp models.Provider, client prometheus.Client, start, end time.
|
|
|
labelsMap,
|
|
|
clusterAndNameToType,
|
|
|
resolution,
|
|
|
+ overheadMap,
|
|
|
)
|
|
|
|
|
|
c, err := cp.GetConfig()
|
|
|
@@ -726,8 +742,8 @@ func ClusterLoadBalancers(client prometheus.Client, start, end time.Time) (map[L
|
|
|
|
|
|
ctx := prom.NewNamedContext(client, prom.ClusterContextName)
|
|
|
|
|
|
- queryLBCost := fmt.Sprintf(`avg(avg_over_time(kubecost_load_balancer_cost[%s])) by (namespace, service_name, %s, ingress_ip)`, durStr, env.GetPromClusterLabel())
|
|
|
- queryActiveMins := fmt.Sprintf(`avg(kubecost_load_balancer_cost) by (namespace, service_name, %s, ingress_ip)[%s:%dm]`, env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
+ queryLBCost := fmt.Sprintf(`avg(avg_over_time(kubecost_load_balancer_cost{%s}[%s])) by (namespace, service_name, %s, ingress_ip)`, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
+ queryActiveMins := fmt.Sprintf(`avg(kubecost_load_balancer_cost{%s}) by (namespace, service_name, %s, ingress_ip)[%s:%dm]`, env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
|
|
|
|
|
|
resChLBCost := ctx.QueryAtTime(queryLBCost, t)
|
|
|
resChActiveMins := ctx.QueryAtTime(queryActiveMins, t)
|
|
|
@@ -862,49 +878,49 @@ func (a *Accesses) ComputeClusterCosts(client prometheus.Client, provider models
|
|
|
hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
|
|
|
|
|
|
const fmtQueryDataCount = `
|
|
|
- count_over_time(sum(kube_node_status_capacity_cpu_cores) by (%s)[%s:%dm]%s) * %d
|
|
|
+ count_over_time(sum(kube_node_status_capacity_cpu_cores{%s}) by (%s)[%s:%dm]%s) * %d
|
|
|
`
|
|
|
|
|
|
const fmtQueryTotalGPU = `
|
|
|
sum(
|
|
|
- sum_over_time(node_gpu_hourly_cost[%s:%dm]%s) * %f
|
|
|
+ sum_over_time(node_gpu_hourly_cost{%s}[%s:%dm]%s) * %f
|
|
|
) by (%s)
|
|
|
`
|
|
|
|
|
|
const fmtQueryTotalCPU = `
|
|
|
sum(
|
|
|
- sum_over_time(avg(kube_node_status_capacity_cpu_cores) by (node, %s)[%s:%dm]%s) *
|
|
|
- avg(avg_over_time(node_cpu_hourly_cost[%s:%dm]%s)) by (node, %s) * %f
|
|
|
+ sum_over_time(avg(kube_node_status_capacity_cpu_cores{%s}) by (node, %s)[%s:%dm]%s) *
|
|
|
+ avg(avg_over_time(node_cpu_hourly_cost{%s}[%s:%dm]%s)) by (node, %s) * %f
|
|
|
) by (%s)
|
|
|
`
|
|
|
|
|
|
const fmtQueryTotalRAM = `
|
|
|
sum(
|
|
|
- sum_over_time(avg(kube_node_status_capacity_memory_bytes) by (node, %s)[%s:%dm]%s) / 1024 / 1024 / 1024 *
|
|
|
- avg(avg_over_time(node_ram_hourly_cost[%s:%dm]%s)) by (node, %s) * %f
|
|
|
+ sum_over_time(avg(kube_node_status_capacity_memory_bytes{%s}) by (node, %s)[%s:%dm]%s) / 1024 / 1024 / 1024 *
|
|
|
+ avg(avg_over_time(node_ram_hourly_cost{%s}[%s:%dm]%s)) by (node, %s) * %f
|
|
|
) by (%s)
|
|
|
`
|
|
|
|
|
|
const fmtQueryTotalStorage = `
|
|
|
sum(
|
|
|
- sum_over_time(avg(kube_persistentvolume_capacity_bytes) by (persistentvolume, %s)[%s:%dm]%s) / 1024 / 1024 / 1024 *
|
|
|
- avg(avg_over_time(pv_hourly_cost[%s:%dm]%s)) by (persistentvolume, %s) * %f
|
|
|
+ sum_over_time(avg(kube_persistentvolume_capacity_bytes{%s}) by (persistentvolume, %s)[%s:%dm]%s) / 1024 / 1024 / 1024 *
|
|
|
+ avg(avg_over_time(pv_hourly_cost{%s}[%s:%dm]%s)) by (persistentvolume, %s) * %f
|
|
|
) by (%s)
|
|
|
`
|
|
|
|
|
|
const fmtQueryCPUModePct = `
|
|
|
- sum(rate(node_cpu_seconds_total[%s]%s)) by (%s, mode) / ignoring(mode)
|
|
|
- group_left sum(rate(node_cpu_seconds_total[%s]%s)) by (%s)
|
|
|
+ sum(rate(node_cpu_seconds_total{%s}[%s]%s)) by (%s, mode) / ignoring(mode)
|
|
|
+ group_left sum(rate(node_cpu_seconds_total{%s}[%s]%s)) by (%s)
|
|
|
`
|
|
|
|
|
|
const fmtQueryRAMSystemPct = `
|
|
|
- sum(sum_over_time(container_memory_usage_bytes{container_name!="",namespace="kube-system"}[%s:%dm]%s)) by (%s)
|
|
|
- / sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (%s)
|
|
|
+ sum(sum_over_time(container_memory_usage_bytes{container_name!="",namespace="kube-system", %s}[%s:%dm]%s)) by (%s)
|
|
|
+ / sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm]%s)) by (%s)
|
|
|
`
|
|
|
|
|
|
const fmtQueryRAMUserPct = `
|
|
|
- sum(sum_over_time(kubecost_cluster_memory_working_set_bytes[%s:%dm]%s)) by (%s)
|
|
|
- / sum(sum_over_time(kube_node_status_capacity_memory_bytes[%s:%dm]%s)) by (%s)
|
|
|
+ sum(sum_over_time(kubecost_cluster_memory_working_set_bytes{%s}[%s:%dm]%s)) by (%s)
|
|
|
+ / sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm]%s)) by (%s)
|
|
|
`
|
|
|
|
|
|
// TODO niko/clustercost metric "kubelet_volume_stats_used_bytes" was deprecated in 1.12, then seems to have come back in 1.17
|
|
|
@@ -920,11 +936,11 @@ func (a *Accesses) ComputeClusterCosts(client prometheus.Client, provider models
|
|
|
|
|
|
fmtOffset := timeutil.DurationToPromOffsetString(offset)
|
|
|
|
|
|
- queryDataCount := fmt.Sprintf(fmtQueryDataCount, env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, minsPerResolution)
|
|
|
- queryTotalGPU := fmt.Sprintf(fmtQueryTotalGPU, windowStr, minsPerResolution, fmtOffset, hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
- queryTotalCPU := fmt.Sprintf(fmtQueryTotalCPU, env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
- queryTotalRAM := fmt.Sprintf(fmtQueryTotalRAM, env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
- queryTotalStorage := fmt.Sprintf(fmtQueryTotalStorage, env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
+ queryDataCount := fmt.Sprintf(fmtQueryDataCount, env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, minsPerResolution)
|
|
|
+ queryTotalGPU := fmt.Sprintf(fmtQueryTotalGPU, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
+ queryTotalCPU := fmt.Sprintf(fmtQueryTotalCPU, env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
+ queryTotalRAM := fmt.Sprintf(fmtQueryTotalRAM, env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
+ queryTotalStorage := fmt.Sprintf(fmtQueryTotalStorage, env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
|
|
|
|
|
|
ctx := prom.NewNamedContext(client, prom.ClusterContextName)
|
|
|
|
|
|
@@ -946,9 +962,9 @@ func (a *Accesses) ComputeClusterCosts(client prometheus.Client, provider models
|
|
|
}
|
|
|
|
|
|
if withBreakdown {
|
|
|
- queryCPUModePct := fmt.Sprintf(fmtQueryCPUModePct, windowStr, fmtOffset, env.GetPromClusterLabel(), windowStr, fmtOffset, env.GetPromClusterLabel())
|
|
|
- queryRAMSystemPct := fmt.Sprintf(fmtQueryRAMSystemPct, windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel())
|
|
|
- queryRAMUserPct := fmt.Sprintf(fmtQueryRAMUserPct, windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel())
|
|
|
+ queryCPUModePct := fmt.Sprintf(fmtQueryCPUModePct, env.GetPromClusterFilter(), windowStr, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), windowStr, fmtOffset, env.GetPromClusterLabel())
|
|
|
+ queryRAMSystemPct := fmt.Sprintf(fmtQueryRAMSystemPct, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel())
|
|
|
+ queryRAMUserPct := fmt.Sprintf(fmtQueryRAMUserPct, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel())
|
|
|
|
|
|
bdResChs := ctx.QueryAll(
|
|
|
queryCPUModePct,
|
|
|
@@ -1220,10 +1236,10 @@ func ClusterCostsOverTime(cli prometheus.Client, provider models.Provider, start
|
|
|
|
|
|
fmtOffset := timeutil.DurationToPromOffsetString(offset)
|
|
|
|
|
|
- qCores := fmt.Sprintf(queryClusterCores, fmtWindow, fmtOffset, env.GetPromClusterLabel(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
- qRAM := fmt.Sprintf(queryClusterRAM, fmtWindow, fmtOffset, env.GetPromClusterLabel(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
- qStorage := fmt.Sprintf(queryStorage, fmtWindow, fmtOffset, env.GetPromClusterLabel(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterLabel(), localStorageQuery)
|
|
|
- qTotal := fmt.Sprintf(queryTotal, env.GetPromClusterLabel(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), localStorageQuery)
|
|
|
+ qCores := fmt.Sprintf(queryClusterCores, env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
+ qRAM := fmt.Sprintf(queryClusterRAM, env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterLabel())
|
|
|
+ qStorage := fmt.Sprintf(queryStorage, env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, fmtOffset, env.GetPromClusterLabel(), env.GetPromClusterLabel(), localStorageQuery)
|
|
|
+ qTotal := fmt.Sprintf(queryTotal, env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), localStorageQuery)
|
|
|
|
|
|
ctx := prom.NewNamedContext(cli, prom.ClusterContextName)
|
|
|
resChClusterCores := ctx.QueryRange(qCores, start, end, window)
|
|
|
@@ -1273,7 +1289,7 @@ func ClusterCostsOverTime(cli prometheus.Client, provider models.Provider, start
|
|
|
// If clusterTotal query failed, it's likely because there are no PVs, which
|
|
|
// causes the qTotal query to return no data. Instead, query only node costs.
|
|
|
// If that fails, return an error because something is actually wrong.
|
|
|
- qNodes := fmt.Sprintf(queryNodes, env.GetPromClusterLabel(), localStorageQuery)
|
|
|
+ qNodes := fmt.Sprintf(queryNodes, env.GetPromClusterFilter(), env.GetPromClusterLabel(), localStorageQuery)
|
|
|
|
|
|
resultNodes, warnings, err := ctx.QueryRangeSync(qNodes, start, end, window)
|
|
|
for _, warning := range warnings {
|