|
|
@@ -178,33 +178,27 @@ const (
|
|
|
sum(kube_persistentvolumeclaim_resource_requests_storage_bytes) by (persistentvolumeclaim, namespace, cluster_id, kubernetes_name)) by (persistentvolumeclaim, storageclass, namespace, volumename, cluster_id)`
|
|
|
// queryRAMAllocationByteHours yields the total byte-hour RAM allocation over the given
|
|
|
// window, aggregated by container.
|
|
|
- // [line 3] sum(all byte measurements) = [byte*scrape] by metric
|
|
|
- // [lines 4-6] (") / (approximate scrape count per hour) = [byte*hour] by metric
|
|
|
- // [lines 2,7] sum(") by unique container key = [byte*hour] by container
|
|
|
- // [lines 1,8] relabeling
|
|
|
- queryRAMAllocationByteHours = `label_replace(label_replace(
|
|
|
- sum(
|
|
|
- sum_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!=""}[%s])
|
|
|
- / clamp_min(
|
|
|
- count_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!=""}[%s])/%f,
|
|
|
- scalar(avg(avg_over_time(prometheus_target_interval_length_seconds[%s])))*%f)
|
|
|
- ) by (namespace,container,pod,node,cluster_id)
|
|
|
- , "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
|
|
|
+ // [line 3] sum_over_time(each byte*min in window) / (min/hr kubecost up) = [byte*hour] by metric, adjusted for kubecost downtime
|
|
|
+ // [lines 2,4] sum(") by unique container key = [byte*hour] by container
|
|
|
+ // [lines 1,5] relabeling
|
|
|
+ queryRAMAllocationByteHours = `
|
|
|
+ label_replace(label_replace(
|
|
|
+ sum(
|
|
|
+ sum_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!=""}[%s:1m]) / %f
|
|
|
+ ) by (namespace,container,pod,node,cluster_id)
|
|
|
+ , "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
|
|
|
// queryCPUAllocationVCPUHours yields the total VCPU-hour CPU allocation over the given
|
|
|
// window, aggregated by container.
|
|
|
- // [line 3] sum(all VCPU measurements within given window) = [VCPU*scrape] by metric
|
|
|
- // [lines 4-6] (") / (approximate scrape count per hour) = [VCPU*hour] by metric
|
|
|
- // [lines 2,7] sum(") by unique container key = [VCPU*hour] by container
|
|
|
- // [lines 1,8] relabeling
|
|
|
- queryCPUAllocationVCPUHours = `label_replace(label_replace(
|
|
|
- sum(
|
|
|
- sum_over_time(container_cpu_allocation{container!="",container!="POD", node!=""}[%s])
|
|
|
- / clamp_min(
|
|
|
- count_over_time(container_cpu_allocation{container!="",container!="POD", node!=""}[%s])/%f,
|
|
|
- scalar(avg(avg_over_time(prometheus_target_interval_length_seconds[%s])))*%f)
|
|
|
- ) by (namespace,container,pod,node,cluster_id)
|
|
|
- , "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
|
|
|
- // queryPVCAllocationFmt yields the total byte-hour CPU allocation over the given window.
|
|
|
+ // [line 3] sum_over_time(each VCPU*mins in window) / (min/hr kubecost up) = [VCPU*hour] by metric, adjusted for kubecost downtime
|
|
|
+ // [lines 2,4] sum(") by unique container key = [VCPU*hour] by container
|
|
|
+ // [lines 1,5] relabeling
|
|
|
+ queryCPUAllocationVCPUHours = `
|
|
|
+ label_replace(label_replace(
|
|
|
+ sum(
|
|
|
+ sum_over_time(container_cpu_allocation{container!="",container!="POD", node!=""}[%s:1m]) / %f
|
|
|
+ ) by (namespace,container,pod,node,cluster_id)
|
|
|
+ , "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
|
|
|
+ // queryPVCAllocationFmt yields the total byte-hour PVC allocation over the given window.
|
|
|
// sum(all VCPU measurements within given window) = [byte*min] by metric
|
|
|
// (") / 60 = [byte*hour] by metric, assuming no missed scrapes
|
|
|
// (") * (normalization factor) = [byte*hour] by metric, normalized for missed scrapes
|
|
|
@@ -226,6 +220,7 @@ const (
|
|
|
queryRegionNetworkUsage = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false"}[%s] %s)) by (namespace,pod_name,cluster_id) / 1024 / 1024 / 1024`
|
|
|
queryInternetNetworkUsage = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true"}[%s] %s)) by (namespace,pod_name,cluster_id) / 1024 / 1024 / 1024`
|
|
|
normalizationStr = `max(count_over_time(kube_pod_container_resource_requests_memory_bytes{}[%s] %s))`
|
|
|
+ kubecostUpMinsPerHourStr = `max(count_over_time(node_cpu_hourly_cost[%s:1m])) / %f`
|
|
|
)
|
|
|
|
|
|
type PrometheusMetadata struct {
|
|
|
@@ -1492,28 +1487,7 @@ func (cm *CostModel) ComputeCostDataRange(cli prometheusClient.Client, clientset
|
|
|
return data, err
|
|
|
}
|
|
|
|
|
|
-func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubernetes.Interface, cp costAnalyzerCloud.Provider,
|
|
|
- startString, endString, windowString string, resolutionHours float64, filterNamespace string, filterCluster string, remoteEnabled bool) (map[string]*CostData, error) {
|
|
|
-
|
|
|
- // Use a heuristic to tell the difference between missed scrapes and an incomplete window
|
|
|
- // of data due to fresh install, etc.
|
|
|
- minimumExpectedScrapeRate := 0.95
|
|
|
-
|
|
|
- queryRAMAlloc := fmt.Sprintf(queryRAMAllocationByteHours, windowString, windowString, resolutionHours, windowString, minimumExpectedScrapeRate)
|
|
|
- queryCPUAlloc := fmt.Sprintf(queryCPUAllocationVCPUHours, windowString, windowString, resolutionHours, windowString, minimumExpectedScrapeRate)
|
|
|
- queryRAMRequests := fmt.Sprintf(queryRAMRequestsStr, windowString, "", windowString, "")
|
|
|
- queryRAMUsage := fmt.Sprintf(queryRAMUsageStr, windowString, "", windowString, "")
|
|
|
- queryCPURequests := fmt.Sprintf(queryCPURequestsStr, windowString, "", windowString, "")
|
|
|
- queryCPUUsage := fmt.Sprintf(queryCPUUsageStr, windowString, "")
|
|
|
- queryGPURequests := fmt.Sprintf(queryGPURequestsStr, windowString, "", windowString, "", resolutionHours, windowString, "")
|
|
|
- queryPVRequests := fmt.Sprintf(queryPVRequestsStr)
|
|
|
- queryPVCAllocation := fmt.Sprintf(queryPVCAllocationFmt, windowString, windowString, resolutionHours, minimumExpectedScrapeRate)
|
|
|
- queryPVHourlyCost := fmt.Sprintf(queryPVHourlyCostFmt, windowString)
|
|
|
- queryNetZoneRequests := fmt.Sprintf(queryZoneNetworkUsage, windowString, "")
|
|
|
- queryNetRegionRequests := fmt.Sprintf(queryRegionNetworkUsage, windowString, "")
|
|
|
- queryNetInternetRequests := fmt.Sprintf(queryInternetNetworkUsage, windowString, "")
|
|
|
- queryNormalization := fmt.Sprintf(normalizationStr, windowString, "")
|
|
|
-
|
|
|
+func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubernetes.Interface, cp costAnalyzerCloud.Provider, startString, endString, windowString string, resolutionHours float64, filterNamespace string, filterCluster string, remoteEnabled bool) (map[string]*CostData, error) {
|
|
|
layout := "2006-01-02T15:04:05.000Z"
|
|
|
|
|
|
start, err := time.Parse(layout, startString)
|
|
|
@@ -1521,16 +1495,19 @@ func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubern
|
|
|
klog.V(1).Infof("Error parsing time " + startString + ". Error: " + err.Error())
|
|
|
return nil, err
|
|
|
}
|
|
|
+
|
|
|
end, err := time.Parse(layout, endString)
|
|
|
if err != nil {
|
|
|
klog.V(1).Infof("Error parsing time " + endString + ". Error: " + err.Error())
|
|
|
return nil, err
|
|
|
}
|
|
|
+
|
|
|
window, err := time.ParseDuration(windowString)
|
|
|
if err != nil {
|
|
|
klog.V(1).Infof("Error parsing time " + windowString + ". Error: " + err.Error())
|
|
|
return nil, err
|
|
|
}
|
|
|
+
|
|
|
clusterID := env.GetClusterID()
|
|
|
|
|
|
durHrs := end.Sub(start).Hours() + 1
|
|
|
@@ -1543,10 +1520,59 @@ func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubern
|
|
|
return CostDataRangeFromSQL("", "", windowString, remoteStartStr, remoteEndStr)
|
|
|
}
|
|
|
|
|
|
+ ctx := prom.NewContext(cli)
|
|
|
+
|
|
|
+ // Query for the average number of minutes per hour that Kubecost was up
|
|
|
+ // in the given range by averaging the number of up minutes-per-hour for
|
|
|
+ // each window in the range. Use that number in the RAM and CPU allocation
|
|
|
+ // queries as the adjutsment factor, scaling only if Kubecost was down
|
|
|
+ // for fewer than 3 minutes (as a heuristic for a reasonable amount of
|
|
|
+ // time to interpolate). Otherwise, use 60 minutes per hour and assume
|
|
|
+ // that this period of time is during Kubecost start-up or a long-term
|
|
|
+ // downtime for which we don't want to interpolate.
|
|
|
+ queryKubecostUpMinsPerHour := fmt.Sprintf(kubecostUpMinsPerHourStr, windowString, window.Hours())
|
|
|
+ resKubecostUp, err := ctx.QueryRangeSync(queryKubecostUpMinsPerHour, start, end, window)
|
|
|
+ if err != nil {
|
|
|
+ log.Errorf("costDataRange: error querying Kubecost up: %s", err)
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+
|
|
|
+ kubecostMinsPerHour := 0.0
|
|
|
+ num := 0
|
|
|
+ if len(resKubecostUp) > 0 {
|
|
|
+ for _, val := range resKubecostUp[0].Values {
|
|
|
+ kubecostMinsPerHour += val.Value
|
|
|
+ num++
|
|
|
+ }
|
|
|
+ kubecostMinsPerHour /= float64(num)
|
|
|
+ }
|
|
|
+ if kubecostMinsPerHour <= 57.0 {
|
|
|
+ kubecostMinsPerHour = 60.0
|
|
|
+ }
|
|
|
+
|
|
|
+ // TODO niko/queryfix rewrite PVCAllocation query too, and remove this
|
|
|
+ // Use a heuristic to tell the difference between missed scrapes and an incomplete window
|
|
|
+ // of data due to fresh install, etc.
|
|
|
+ minimumExpectedScrapeRate := 0.95
|
|
|
+
|
|
|
+ queryRAMAlloc := fmt.Sprintf(queryRAMAllocationByteHours, windowString, kubecostMinsPerHour)
|
|
|
+ queryCPUAlloc := fmt.Sprintf(queryCPUAllocationVCPUHours, windowString, kubecostMinsPerHour)
|
|
|
+ queryRAMRequests := fmt.Sprintf(queryRAMRequestsStr, windowString, "", windowString, "")
|
|
|
+ queryRAMUsage := fmt.Sprintf(queryRAMUsageStr, windowString, "", windowString, "")
|
|
|
+ queryCPURequests := fmt.Sprintf(queryCPURequestsStr, windowString, "", windowString, "")
|
|
|
+ queryCPUUsage := fmt.Sprintf(queryCPUUsageStr, windowString, "")
|
|
|
+ queryGPURequests := fmt.Sprintf(queryGPURequestsStr, windowString, "", windowString, "", resolutionHours, windowString, "")
|
|
|
+ queryPVRequests := fmt.Sprintf(queryPVRequestsStr)
|
|
|
+ queryPVCAllocation := fmt.Sprintf(queryPVCAllocationFmt, windowString, windowString, resolutionHours, minimumExpectedScrapeRate)
|
|
|
+ queryPVHourlyCost := fmt.Sprintf(queryPVHourlyCostFmt, windowString)
|
|
|
+ queryNetZoneRequests := fmt.Sprintf(queryZoneNetworkUsage, windowString, "")
|
|
|
+ queryNetRegionRequests := fmt.Sprintf(queryRegionNetworkUsage, windowString, "")
|
|
|
+ queryNetInternetRequests := fmt.Sprintf(queryInternetNetworkUsage, windowString, "")
|
|
|
+ queryNormalization := fmt.Sprintf(normalizationStr, windowString, "")
|
|
|
+
|
|
|
queryProfileStart := time.Now()
|
|
|
|
|
|
// Submit all queries for concurrent evaluation
|
|
|
- ctx := prom.NewContext(cli)
|
|
|
resChRAMRequests := ctx.QueryRange(queryRAMRequests, start, end, window)
|
|
|
resChRAMUsage := ctx.QueryRange(queryRAMUsage, start, end, window)
|
|
|
resChRAMAlloc := ctx.QueryRange(queryRAMAlloc, start, end, window)
|