5 лет назад · 2d3fa4a645
--- a/pkg/costmodel/costmodel.go
+++ b/pkg/costmodel/costmodel.go
@@ -178,33 +178,27 @@ const (
 
				 	sum(kube_persistentvolumeclaim_resource_requests_storage_bytes) by (persistentvolumeclaim, namespace, cluster_id, kubernetes_name)) by (persistentvolumeclaim, storageclass, namespace, volumename, cluster_id)`
			
 
				 	// queryRAMAllocationByteHours yields the total byte-hour RAM allocation over the given
			
 
				 	// window, aggregated by container.
			
 
				-	//  [line 3]     sum(all byte measurements) = [byte*scrape] by metric
			
 
				-	//  [lines 4-6]  (") / (approximate scrape count per hour) = [byte*hour] by metric
			
 
				-	//  [lines 2,7]     sum(") by unique container key = [byte*hour] by container
			
 
				-	//  [lines 1,8]  relabeling
			
 
				-	queryRAMAllocationByteHours = `label_replace(label_replace(
			
 
				-		sum(
			
 
				-			sum_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!=""}[%s])
			
 
				-			/ clamp_min(
			
 
				-				count_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!=""}[%s])/%f,
			
 
				-				scalar(avg(avg_over_time(prometheus_target_interval_length_seconds[%s])))*%f)
			
 
				-		) by (namespace,container,pod,node,cluster_id)
			
 
				-	, "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
			
 
				+	//  [line 3]     sum_over_time(each byte*min in window) / (min/hr kubecost up) = [byte*hour] by metric, adjusted for kubecost downtime
			
 
				+	//  [lines 2,4]  sum(") by unique container key = [byte*hour] by container
			
 
				+	//  [lines 1,5]  relabeling
			
 
				+	queryRAMAllocationByteHours = `
			
 
				+		label_replace(label_replace(
			
 
				+			sum(
			
 
				+				sum_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!=""}[%s:1m]) / %f 
			
 
				+			) by (namespace,container,pod,node,cluster_id)
			
 
				+		, "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
			
 
				 	// queryCPUAllocationVCPUHours yields the total VCPU-hour CPU allocation over the given
			
 
				 	// window, aggregated by container.
			
 
				-	//  [line 3]     sum(all VCPU measurements within given window) = [VCPU*scrape] by metric
			
 
				-	//  [lines 4-6]  (") / (approximate scrape count per hour) = [VCPU*hour] by metric
			
 
				-	//  [lines 2,7]     sum(") by unique container key = [VCPU*hour] by container
			
 
				-	//  [lines 1,8]  relabeling
			
 
				-	queryCPUAllocationVCPUHours = `label_replace(label_replace(
			
 
				-		sum(
			
 
				-			sum_over_time(container_cpu_allocation{container!="",container!="POD", node!=""}[%s])
			
 
				-			/ clamp_min(
			
 
				-				count_over_time(container_cpu_allocation{container!="",container!="POD", node!=""}[%s])/%f,
			
 
				-				scalar(avg(avg_over_time(prometheus_target_interval_length_seconds[%s])))*%f)
			
 
				-		) by (namespace,container,pod,node,cluster_id)
			
 
				-	, "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
			
 
				-	// queryPVCAllocationFmt yields the total byte-hour CPU allocation over the given window.
			
 
				+	//  [line 3]     sum_over_time(each VCPU*mins in window) / (min/hr kubecost up) = [VCPU*hour] by metric, adjusted for kubecost downtime
			
 
				+	//  [lines 2,4]  sum(") by unique container key = [VCPU*hour] by container
			
 
				+	//  [lines 1,5]  relabeling
			
 
				+	queryCPUAllocationVCPUHours = `
			
 
				+		label_replace(label_replace(
			
 
				+			sum(
			
 
				+				sum_over_time(container_cpu_allocation{container!="",container!="POD", node!=""}[%s:1m]) / %f
			
 
				+			) by (namespace,container,pod,node,cluster_id)
			
 
				+		, "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
			
 
				+	// queryPVCAllocationFmt yields the total byte-hour PVC allocation over the given window.
			
 
				 	//  sum(all VCPU measurements within given window) = [byte*min] by metric
			
 
				 	//  (") / 60 = [byte*hour] by metric, assuming no missed scrapes
			
 
				 	//  (") * (normalization factor) = [byte*hour] by metric, normalized for missed scrapes
			
@@ -226,6 +220,7 @@ const (
 
				 	queryRegionNetworkUsage   = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false"}[%s] %s)) by (namespace,pod_name,cluster_id) / 1024 / 1024 / 1024`
			
 
				 	queryInternetNetworkUsage = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true"}[%s] %s)) by (namespace,pod_name,cluster_id) / 1024 / 1024 / 1024`
			
 
				 	normalizationStr          = `max(count_over_time(kube_pod_container_resource_requests_memory_bytes{}[%s] %s))`
			
 
				+	kubecostUpMinsPerHourStr  = `max(count_over_time(node_cpu_hourly_cost[%s:1m])) / %f`
			
 
				 )
			
 
				 
			
 
				 type PrometheusMetadata struct {
			
@@ -1492,28 +1487,7 @@ func (cm *CostModel) ComputeCostDataRange(cli prometheusClient.Client, clientset
 
				 	return data, err
			
 
				 }
			
 
				 
			
 
				-func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubernetes.Interface, cp costAnalyzerCloud.Provider,
			
 
				-	startString, endString, windowString string, resolutionHours float64, filterNamespace string, filterCluster string, remoteEnabled bool) (map[string]*CostData, error) {
			
 
				-
			
 
				-	// Use a heuristic to tell the difference between missed scrapes and an incomplete window
			
 
				-	// of data due to fresh install, etc.
			
 
				-	minimumExpectedScrapeRate := 0.95
			
 
				-
			
 
				-	queryRAMAlloc := fmt.Sprintf(queryRAMAllocationByteHours, windowString, windowString, resolutionHours, windowString, minimumExpectedScrapeRate)
			
 
				-	queryCPUAlloc := fmt.Sprintf(queryCPUAllocationVCPUHours, windowString, windowString, resolutionHours, windowString, minimumExpectedScrapeRate)
			
 
				-	queryRAMRequests := fmt.Sprintf(queryRAMRequestsStr, windowString, "", windowString, "")
			
 
				-	queryRAMUsage := fmt.Sprintf(queryRAMUsageStr, windowString, "", windowString, "")
			
 
				-	queryCPURequests := fmt.Sprintf(queryCPURequestsStr, windowString, "", windowString, "")
			
 
				-	queryCPUUsage := fmt.Sprintf(queryCPUUsageStr, windowString, "")
			
 
				-	queryGPURequests := fmt.Sprintf(queryGPURequestsStr, windowString, "", windowString, "", resolutionHours, windowString, "")
			
 
				-	queryPVRequests := fmt.Sprintf(queryPVRequestsStr)
			
 
				-	queryPVCAllocation := fmt.Sprintf(queryPVCAllocationFmt, windowString, windowString, resolutionHours, minimumExpectedScrapeRate)
			
 
				-	queryPVHourlyCost := fmt.Sprintf(queryPVHourlyCostFmt, windowString)
			
 
				-	queryNetZoneRequests := fmt.Sprintf(queryZoneNetworkUsage, windowString, "")
			
 
				-	queryNetRegionRequests := fmt.Sprintf(queryRegionNetworkUsage, windowString, "")
			
 
				-	queryNetInternetRequests := fmt.Sprintf(queryInternetNetworkUsage, windowString, "")
			
 
				-	queryNormalization := fmt.Sprintf(normalizationStr, windowString, "")
			
 
				-
			
 
				+func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubernetes.Interface, cp costAnalyzerCloud.Provider, startString, endString, windowString string, resolutionHours float64, filterNamespace string, filterCluster string, remoteEnabled bool) (map[string]*CostData, error) {
			
 
				 	layout := "2006-01-02T15:04:05.000Z"
			
 
				 
			
 
				 	start, err := time.Parse(layout, startString)
			
@@ -1521,16 +1495,19 @@ func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubern
 
				 		klog.V(1).Infof("Error parsing time " + startString + ". Error: " + err.Error())
			
 
				 		return nil, err
			
 
				 	}
			
 
				+
			
 
				 	end, err := time.Parse(layout, endString)
			
 
				 	if err != nil {
			
 
				 		klog.V(1).Infof("Error parsing time " + endString + ". Error: " + err.Error())
			
 
				 		return nil, err
			
 
				 	}
			
 
				+
			
 
				 	window, err := time.ParseDuration(windowString)
			
 
				 	if err != nil {
			
 
				 		klog.V(1).Infof("Error parsing time " + windowString + ". Error: " + err.Error())
			
 
				 		return nil, err
			
 
				 	}
			
 
				+
			
 
				 	clusterID := env.GetClusterID()
			
 
				 
			
 
				 	durHrs := end.Sub(start).Hours() + 1
			
@@ -1543,10 +1520,59 @@ func (cm *CostModel) costDataRange(cli prometheusClient.Client, clientset kubern
 
				 		return CostDataRangeFromSQL("", "", windowString, remoteStartStr, remoteEndStr)
			
 
				 	}
			
 
				 
			
 
				+	ctx := prom.NewContext(cli)
			
 
				+
			
 
				+	// Query for the average number of minutes per hour that Kubecost was up
			
 
				+	// in the given range by averaging the number of up minutes-per-hour for
			
 
				+	// each window in the range. Use that number in the RAM and CPU allocation
			
 
				+	// queries as the adjutsment factor, scaling only if Kubecost was down
			
 
				+	// for fewer than 3 minutes (as a heuristic for a reasonable amount of
			
 
				+	// time to interpolate). Otherwise, use 60 minutes per hour and assume
			
 
				+	// that this period of time is during Kubecost start-up or a long-term
			
 
				+	// downtime for which we don't want to interpolate.
			
 
				+	queryKubecostUpMinsPerHour := fmt.Sprintf(kubecostUpMinsPerHourStr, windowString, window.Hours())
			
 
				+	resKubecostUp, err := ctx.QueryRangeSync(queryKubecostUpMinsPerHour, start, end, window)
			
 
				+	if err != nil {
			
 
				+		log.Errorf("costDataRange: error querying Kubecost up: %s", err)
			
 
				+		return nil, err
			
 
				+	}
			
 
				+
			
 
				+	kubecostMinsPerHour := 0.0
			
 
				+	num := 0
			
 
				+	if len(resKubecostUp) > 0 {
			
 
				+		for _, val := range resKubecostUp[0].Values {
			
 
				+			kubecostMinsPerHour += val.Value
			
 
				+			num++
			
 
				+		}
			
 
				+		kubecostMinsPerHour /= float64(num)
			
 
				+	}
			
 
				+	if kubecostMinsPerHour <= 57.0 {
			
 
				+		kubecostMinsPerHour = 60.0
			
 
				+	}
			
 
				+
			
 
				+	// TODO niko/queryfix rewrite PVCAllocation query too, and remove this
			
 
				+	// Use a heuristic to tell the difference between missed scrapes and an incomplete window
			
 
				+	// of data due to fresh install, etc.
			
 
				+	minimumExpectedScrapeRate := 0.95
			
 
				+
			
 
				+	queryRAMAlloc := fmt.Sprintf(queryRAMAllocationByteHours, windowString, kubecostMinsPerHour)
			
 
				+	queryCPUAlloc := fmt.Sprintf(queryCPUAllocationVCPUHours, windowString, kubecostMinsPerHour)
			
 
				+	queryRAMRequests := fmt.Sprintf(queryRAMRequestsStr, windowString, "", windowString, "")
			
 
				+	queryRAMUsage := fmt.Sprintf(queryRAMUsageStr, windowString, "", windowString, "")
			
 
				+	queryCPURequests := fmt.Sprintf(queryCPURequestsStr, windowString, "", windowString, "")
			
 
				+	queryCPUUsage := fmt.Sprintf(queryCPUUsageStr, windowString, "")
			
 
				+	queryGPURequests := fmt.Sprintf(queryGPURequestsStr, windowString, "", windowString, "", resolutionHours, windowString, "")
			
 
				+	queryPVRequests := fmt.Sprintf(queryPVRequestsStr)
			
 
				+	queryPVCAllocation := fmt.Sprintf(queryPVCAllocationFmt, windowString, windowString, resolutionHours, minimumExpectedScrapeRate)
			
 
				+	queryPVHourlyCost := fmt.Sprintf(queryPVHourlyCostFmt, windowString)
			
 
				+	queryNetZoneRequests := fmt.Sprintf(queryZoneNetworkUsage, windowString, "")
			
 
				+	queryNetRegionRequests := fmt.Sprintf(queryRegionNetworkUsage, windowString, "")
			
 
				+	queryNetInternetRequests := fmt.Sprintf(queryInternetNetworkUsage, windowString, "")
			
 
				+	queryNormalization := fmt.Sprintf(normalizationStr, windowString, "")
			
 
				+
			
 
				 	queryProfileStart := time.Now()
			
 
				 
			
 
				 	// Submit all queries for concurrent evaluation
			
 
				-	ctx := prom.NewContext(cli)
			
 
				 	resChRAMRequests := ctx.QueryRange(queryRAMRequests, start, end, window)
			
 
				 	resChRAMUsage := ctx.QueryRange(queryRAMUsage, start, end, window)
			
 
				 	resChRAMAlloc := ctx.QueryRange(queryRAMAlloc, start, end, window)