Просмотр исходного кода

Merge branch 'develop' into sean/cloudcost-view-labels

Sean Holcomb 2 лет назад
Родитель
Сommit
e588f9fba4
3 измененных файлов с 50 добавлено и 44 удалено
  1. 5 6
      pkg/costmodel/costmodel.go
  2. 44 37
      pkg/costmodel/metrics.go
  3. 1 1
      ui/Dockerfile.cross

+ 5 - 6
pkg/costmodel/costmodel.go

@@ -31,6 +31,8 @@ const (
 
 	profileThreshold = 1000 * 1000 * 1000 // 1s (in ns)
 
+	unmountedPVsContainer = "unmounted-pvs"
+
 	apiPrefix         = "/api/v1"
 	epAlertManagers   = apiPrefix + "/alertmanagers"
 	epLabelValues     = apiPrefix + "/label/:name/values"
@@ -724,16 +726,13 @@ func findUnmountedPVCostData(clusterMap clusters.ClusterMap, unmountedPVs map[st
 
 		namespaceAnnotations, _ := namespaceAnnotationsMapping[ns+","+clusterID]
 
-		// Should be a unique "Unmounted" cost data type
-		name := "unmounted-pvs"
-
-		metric := NewContainerMetricFromValues(ns, name, name, "", clusterID)
+		metric := NewContainerMetricFromValues(ns, unmountedPVsContainer, unmountedPVsContainer, "", clusterID)
 		key := metric.Key()
 
 		if costData, ok := costs[key]; !ok {
 			costs[key] = &CostData{
-				Name:            name,
-				PodName:         name,
+				Name:            unmountedPVsContainer,
+				PodName:         unmountedPVsContainer,
 				NodeName:        "",
 				Annotations:     namespaceAnnotations,
 				Namespace:       ns,

+ 44 - 37
pkg/costmodel/metrics.go

@@ -421,7 +421,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 			var ok bool
 			defaultRegion, ok = util.GetRegion(nodeList[0].Labels)
 			if !ok {
-				log.DedupedWarningf(5, "Failed to locate default region")
+				log.DedupedWarningf(5, "Failed to read default region from labels on node %s", nodeList[0].Name)
 			}
 		}
 
@@ -471,7 +471,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 			// TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
 			nodes, err := cmme.Model.GetNodeCost(cmme.CloudProvider)
 			if err != nil {
-				log.Warnf("Metric emission: error getting Node cost: %s", err)
+				log.Warnf("Error getting Node cost: %s", err)
 			}
 			for nodeName, node := range nodes {
 				// Emit costs, guarding against NaN inputs for custom pricing.
@@ -535,24 +535,27 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 				// don't record cpuCost, ramCost, or gpuCost in the case of wild outliers
 				// k8s api sometimes causes cost spikes as described here:
 				// https://github.com/opencost/opencost/issues/927
-				if cpuCost < outlierFactor*avgCosts.CpuCostAverage {
+				cpuOutlierCutoff := outlierFactor * avgCosts.CpuCostAverage
+				if cpuCost < cpuOutlierCutoff {
 					cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(cpuCost)
 					avgCosts.CpuCostAverage = (avgCosts.CpuCostAverage*avgCosts.NumCpuDataPoints + cpuCost) / (avgCosts.NumCpuDataPoints + 1)
 					avgCosts.NumCpuDataPoints += 1
 				} else {
-					log.Warnf("CPU cost outlier detected; skipping data point.")
+					log.Debugf("CPU cost outlier detected; skipping data point: %s had %f as cost, which is above %f.", nodeName, cpuCost, cpuOutlierCutoff)
 				}
-				if ramCost < outlierFactor*avgCosts.RamCostAverage {
+				ramOutlierCutoff := outlierFactor * avgCosts.RamCostAverage
+				if ramCost < ramOutlierCutoff {
 					cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(ramCost)
 					avgCosts.RamCostAverage = (avgCosts.RamCostAverage*avgCosts.NumRamDataPoints + ramCost) / (avgCosts.NumRamDataPoints + 1)
 					avgCosts.NumRamDataPoints += 1
 				} else {
-					log.Warnf("RAM cost outlier detected; skipping data point.")
+					log.Debugf("RAM cost outlier detected; skipping data point: %s had %f as cost, which is above %f.", nodeName, ramCost, ramOutlierCutoff)
 				}
 				// skip redording totalCost if any constituent costs were outliers
-				if cpuCost < outlierFactor*avgCosts.CpuCostAverage &&
-					ramCost < outlierFactor*avgCosts.RamCostAverage {
+				if cpuCost < cpuOutlierCutoff && ramCost < ramOutlierCutoff {
 					cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(totalCost)
+				} else {
+					log.Debugf("CPU and RAM outlier detected, not recording node %s total cost %f", nodeName, totalCost)
 				}
 
 				nodeCostAverages[labelKey] = avgCosts
@@ -568,7 +571,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 			// TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
 			loadBalancers, err := cmme.Model.GetLBCost(cmme.CloudProvider)
 			if err != nil {
-				log.Warnf("Metric emission: error getting LoadBalancer cost: %s", err)
+				log.Warnf("Error getting LoadBalancer cost: %s", err)
 			}
 			for lbKey, lb := range loadBalancers {
 				// TODO: parse (if necessary) and calculate cost associated with loadBalancer based on dynamic cloud prices fetched into each lb struct on GetLBCost() call
@@ -644,7 +647,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 
 				parameters, ok := storageClassMap[pv.Spec.StorageClassName]
 				if !ok {
-					log.Debugf("Unable to find parameters for storage class \"%s\". Does pv \"%s\" have a storageClassName?", pv.Spec.StorageClassName, pv.Name)
+					log.Debugf("Unable to find parameters for storage class \"%s\". Pv \"%s\" might have an empty or invalid storageClassName.", pv.Spec.StorageClassName, pv.Name)
 				}
 				var region string
 				if r, ok := util.GetRegion(pv.Labels); ok {
@@ -670,44 +673,44 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 			// longer exist
 			for labelString, seen := range nodeSeen {
 				if !seen {
-					log.Debugf("Removing %s from nodes", labelString)
+					log.Debugf("Removing metrics for %s, no data observed recently", labelString)
 					labels := getLabelStringsFromKey(labelString)
 
 					ok := cmme.NodeTotalPriceRecorder.DeleteLabelValues(labels...)
 					if ok {
-						log.Debugf("removed %s from totalprice", labelString)
+						log.Debugf("No data observed for node with labels %v, removed from totalprice", labels)
 					} else {
-						log.Errorf("FAILURE TO REMOVE %s from totalprice", labelString)
+						log.Warnf("Failed to remove label set %v from metric node_total_hourly_cost. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					ok = cmme.NodeSpotRecorder.DeleteLabelValues(labels...)
 					if ok {
-						log.Debugf("removed %s from spot records", labelString)
+						log.Debugf("No data observed for node with labels %v, removed from spot records", labels)
 					} else {
-						log.Errorf("FAILURE TO REMOVE %s from spot records", labelString)
+						log.Warnf("Failed to remove label set %v from metric kubecost_node_is_spot. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					ok = cmme.CPUPriceRecorder.DeleteLabelValues(labels...)
 					if ok {
-						log.Debugf("removed %s from cpuprice", labelString)
+						log.Debugf("No data observed for node with labels %v, removed from cpuprice", labels)
 					} else {
-						log.Errorf("FAILURE TO REMOVE %s from cpuprice", labelString)
+						log.Warnf("Failed to remove label set %v from metric node_cpu_hourly_cost. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					ok = cmme.GPUPriceRecorder.DeleteLabelValues(labels...)
 					if ok {
-						log.Debugf("removed %s from gpuprice", labelString)
+						log.Debugf("No data observed for node with labels %v, removed from gpuprice", labels)
 					} else {
-						log.Errorf("FAILURE TO REMOVE %s from gpuprice", labelString)
+						log.Warnf("Failed to remove label set %v from metric node_gpu_hourly_cost. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					ok = cmme.GPUCountRecorder.DeleteLabelValues(labels...)
 					if ok {
-						log.Debugf("removed %s from gpucount", labelString)
+						log.Debugf("No data observed for node with labels %v, removed from gpucount", labels)
 					} else {
-						log.Errorf("FAILURE TO REMOVE %s from gpucount", labelString)
+						log.Warnf("Failed to remove label set %v from metric node_gpu_count. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					ok = cmme.RAMPriceRecorder.DeleteLabelValues(labels...)
 					if ok {
-						log.Debugf("removed %s from ramprice", labelString)
+						log.Debugf("No data observed for node with labels %v, removed from ramprice", labels)
 					} else {
-						log.Errorf("FAILURE TO REMOVE %s from ramprice", labelString)
+						log.Warnf("Failed to remove label set %v from metric node_ram_hourly_cost. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					delete(nodeSeen, labelString)
 					delete(nodeCostAverages, labelString)
@@ -720,7 +723,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 					labels := getLabelStringsFromKey(labelString)
 					ok := cmme.LBCostRecorder.DeleteLabelValues(labels...)
 					if !ok {
-						log.Errorf("Metric emission: failed to delete LoadBalancer with labels: %v", labels)
+						log.Warnf("Failed to remove label set %v from metric kubecost_load_balancer_cost. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					delete(loadBalancerSeen, labelString)
 				} else {
@@ -730,17 +733,21 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 			for labelString, seen := range containerSeen {
 				if !seen {
 					labels := getLabelStringsFromKey(labelString)
-					ok := cmme.RAMAllocationRecorder.DeleteLabelValues(labels...)
-					if !ok {
-						log.Errorf("Metric emission: failed to delete RAMAllocation with labels: %v", labels)
-					}
-					ok = cmme.CPUAllocationRecorder.DeleteLabelValues(labels...)
-					if !ok {
-						log.Errorf("Metric emission: failed to delete CPUAllocation with labels: %v", labels)
-					}
-					ok = cmme.GPUAllocationRecorder.DeleteLabelValues(labels...)
-					if !ok {
-						log.Errorf("Metric emission: failed to delete GPUAllocation with labels: %v", labels)
+					if len(labels) >= 2 && labels[1] != unmountedPVsContainer { // special "pod" to contain the unmounted PVs - does not have RAM/CPU/...
+						ok := cmme.RAMAllocationRecorder.DeleteLabelValues(labels...)
+						if !ok {
+							log.Warnf("Failed to remove label set %v from metric container_memory_allocation_bytes. Failure to remove stale metrics may result in inaccurate data.", labels)
+						}
+						ok = cmme.CPUAllocationRecorder.DeleteLabelValues(labels...)
+						if !ok {
+							log.Warnf("Failed to remove label set %v from metric container_cpu_allocation. Failure to remove stale metrics may result in inaccurate data.", labels)
+						}
+						ok = cmme.GPUAllocationRecorder.DeleteLabelValues(labels...)
+						if !ok {
+							log.Warnf("Failed to remove label set %v from metric container_gpu_allocation. Failure to remove stale metrics may result in inaccurate data.", labels)
+						}
+					} else {
+						log.Debugf("Did not try to delete RAM/CPU/GPU for fake '%s' container: %v", unmountedPVsContainer, labels)
 					}
 					delete(containerSeen, labelString)
 				} else {
@@ -752,7 +759,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 					labels := getLabelStringsFromKey(labelString)
 					ok := cmme.PersistentVolumePriceRecorder.DeleteLabelValues(labels...)
 					if !ok {
-						log.Errorf("Metric emission: failed to delete PVPrice with labels: %v", labels)
+						log.Warnf("Failed to remove label set %v from metric pv_hourly_cost. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					delete(pvSeen, labelString)
 				} else {
@@ -764,7 +771,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 					labels := getLabelStringsFromKey(labelString)
 					ok := cmme.PVAllocationRecorder.DeleteLabelValues(labels...)
 					if !ok {
-						log.Errorf("Metric emission: failed to delete PVAllocation with labels: %v", labels)
+						log.Warnf("Failed to remove label set %v from metric pod_pvc_allocation. Failure to remove stale metrics may result in inaccurate data.", labels)
 					}
 					delete(pvcSeen, labelString)
 				} else {

+ 1 - 1
ui/Dockerfile.cross

@@ -5,7 +5,7 @@ ENV API_SERVER=0.0.0.0
 ENV UI_PORT=9090
 
 COPY ./dist /var/www
-COPY default.nginx.conf /etc/nginx/conf.d/
+COPY default.nginx.conf.template /etc/nginx/conf.d/default.nginx.conf.template
 COPY nginx.conf /etc/nginx/
 COPY ./docker-entrypoint.sh /usr/local/bin/