Ver Fonte

Merge branch 'develop' into revert-1799-arm64-docker-image

Ajay Tripathy há 3 anos atrás
pai
commit
323fd42365
2 ficheiros alterados com 56 adições e 3 exclusões
  1. 45 3
      pkg/costmodel/allocation.go
  2. 11 0
      pkg/prom/diagnostics.go

+ 45 - 3
pkg/costmodel/allocation.go

@@ -22,7 +22,6 @@ const (
 	queryFmtCPUCoresAllocated        = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtCPURequests              = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtCPUUsageAvg              = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
-	queryFmtCPUUsageMax              = `max(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 	queryFmtGPUsRequested            = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtGPUsAllocated            = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
 	queryFmtNodeCostPerCPUHr         = `avg(avg_over_time(node_cpu_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
@@ -57,6 +56,32 @@ const (
 	queryFmtReplicaSetsWithoutOwners = `avg(avg_over_time(kube_replicaset_owner{owner_kind="<none>", owner_name="<none>"}[%s])) by (replicaset, namespace, %s)`
 	queryFmtLBCostPerHr              = `avg(avg_over_time(kubecost_load_balancer_cost[%s])) by (namespace, service_name, %s)`
 	queryFmtLBActiveMins             = `count(kubecost_load_balancer_cost) by (namespace, service_name, %s)[%s:%s]`
+
+	// Because we use container_cpu_usage_seconds_total to calculate CPU usage
+	// at any given "instant" of time, we need to use an irate or rate. To then
+	// calculate a max (or any aggregation) we have to perform an aggregation
+	// query on top of an instant-by-instant rate. Prometheus supports this
+	// type of query with a "subquery" [1], however it is reportedly expensive
+	// to make such a query. By default, Kubecost's Prometheus config includes
+	// a recording rule that keeps track of the instant-by-instant irate for CPU
+	// usage. The metric in this query is created by that recording rule.
+	//
+	// [1] https://prometheus.io/blog/2019/01/28/subquery-support/
+	//
+	// If changing the name of the recording rule, make sure to update the
+	// corresponding diagnostic query to avoid confusion.
+	queryFmtCPUUsageMaxRecordingRule = `max(max_over_time(kubecost_container_cpu_usage_irate{}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
+	// This is the subquery equivalent of the above recording rule query. It is
+	// more expensive, but does not require the recording rule. It should be
+	// used as a fallback query if the recording rule data does not exist.
+	//
+	// The parameter after the colon [:<thisone>] in the subquery affects the
+	// resolution of the subquery.
+	// The parameter after the metric ...{}[<thisone>] should be set to 2x
+	// the resolution, to make sure the irate always has two points to query
+	// in case the Prom scrape duration has been reduced to be equal to the
+	// ETL resolution.
+	queryFmtCPUUsageMaxSubquery = `max(max_over_time(irate(container_cpu_usage_seconds_total{container_name!="POD", container_name!=""}[%s])[%s:%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
 )
 
 // Constants for Network Cost Subtype
@@ -338,8 +363,26 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	queryCPUUsageAvg := fmt.Sprintf(queryFmtCPUUsageAvg, durStr, env.GetPromClusterLabel())
 	resChCPUUsageAvg := ctx.QueryAtTime(queryCPUUsageAvg, end)
 
-	queryCPUUsageMax := fmt.Sprintf(queryFmtCPUUsageMax, durStr, env.GetPromClusterLabel())
+	queryCPUUsageMax := fmt.Sprintf(queryFmtCPUUsageMaxRecordingRule, durStr, env.GetPromClusterLabel())
 	resChCPUUsageMax := ctx.QueryAtTime(queryCPUUsageMax, end)
+	resCPUUsageMax, _ := resChCPUUsageMax.Await()
+	// If the recording rule has no data, try to fall back to the subquery.
+	if len(resCPUUsageMax) == 0 {
+		// The parameter after the metric ...{}[<thisone>] should be set to 2x
+		// the resolution, to make sure the irate always has two points to query
+		// in case the Prom scrape duration has been reduced to be equal to the
+		// resolution.
+		doubleResStr := timeutil.DurationString(2 * resolution)
+		queryCPUUsageMax = fmt.Sprintf(queryFmtCPUUsageMaxSubquery, doubleResStr, durStr, resStr, env.GetPromClusterLabel())
+		resChCPUUsageMax = ctx.QueryAtTime(queryCPUUsageMax, end)
+		resCPUUsageMax, _ = resChCPUUsageMax.Await()
+
+		// This avoids logspam if there is no data for either metric (e.g. if
+		// Prometheus did not exist during the queried window of time).
+		if len(resCPUUsageMax) > 0 {
+			log.Debugf("CPU usage recording rule query returned an empty result when queried at %s over %s. Fell back to subquery. Consider setting up Kubecost CPU usage recording role to reduce query load on Prometheus; subqueries are expensive.", end.String(), durStr)
+		}
+	}
 
 	queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, durStr, env.GetPromClusterLabel())
 	resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
@@ -449,7 +492,6 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	resCPUCoresAllocated, _ := resChCPUCoresAllocated.Await()
 	resCPURequests, _ := resChCPURequests.Await()
 	resCPUUsageAvg, _ := resChCPUUsageAvg.Await()
-	resCPUUsageMax, _ := resChCPUUsageMax.Await()
 	resRAMBytesAllocated, _ := resChRAMBytesAllocated.Await()
 	resRAMRequests, _ := resChRAMRequests.Await()
 	resRAMUsageAvg, _ := resChRAMUsageAvg.Await()

+ 11 - 0
pkg/prom/diagnostics.go

@@ -36,6 +36,10 @@ const (
 	// CPUThrottlingDiagnosticMetricID is the identifier for the metric used to determine if CPU throttling is being applied to the
 	// cost-model container.
 	CPUThrottlingDiagnosticMetricID = "cpuThrottling"
+
+	// KubecostRecordingRuleCPUUsageID is the identifier for the query used to
+	// determine if the CPU usage recording rule is set up correctly.
+	KubecostRecordingRuleCPUUsageID = "kubecostRecordingRuleCPUUsage"
 )
 
 const DocumentationBaseURL = "https://github.com/kubecost/docs/blob/master/diagnostics.md"
@@ -96,6 +100,13 @@ var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnos
 		Label:       "Kubecost is not CPU throttled",
 		Description: "Kubecost loading slowly? A kubecost component might be CPU throttled",
 	},
+	KubecostRecordingRuleCPUUsageID: {
+		ID:          KubecostRecordingRuleCPUUsageID,
+		QueryFmt:    `absent_over_time(kubecost_container_cpu_usage_irate[5m] %s)`,
+		Label:       "Kubecost's CPU usage recording rule is set up",
+		Description: "If the 'kubecost_container_cpu_usage_irate' recording rule is not set up, Allocation pipeline build may put pressure on your Prometheus due to the use of a subquery.",
+		DocLink:     "https://docs.kubecost.com/install-and-configure/install/custom-prom",
+	},
 }
 
 // QueuedPromRequest is a representation of a request waiting to be sent by the prometheus