2
0
Эх сурвалжийг харах

feat(labels): Add arch label to cost metrics

It's challenging today to display a breakdown of costs by chipset for
workloads. This adds an `arch` label to all cost metrics so that queries
can be written without joining to `kube_node_labels` to understand
the underlying architecture the workload is running on.

- Closes #1916

Signed-off-by: pokom <mark.poko@grafana.com>
pokom 3 жил өмнө
parent
commit
f1acad05b5

+ 1 - 0
pkg/cloud/models/models.go

@@ -60,6 +60,7 @@ type Node struct {
 	Reserved         *ReservedInstanceData `json:"reserved,omitempty"`
 	ProviderID       string                `json:"providerID,omitempty"`
 	PricingType      PricingType           `json:"pricingType,omitempty"`
+	ArchType         string                `json:"archType,omitempty"`
 }
 
 // IsSpot determines whether or not a Node uses spot by usage type

+ 4 - 0
pkg/costmodel/costmodel.go

@@ -1002,6 +1002,10 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 			region, _ := util.GetRegion(n.Labels)
 			newCnode.Region = region
 		}
+		if newCnode.ArchType == "" {
+			arch, _ := util.GetArchType(n.Labels)
+			newCnode.ArchType = arch
+		}
 		newCnode.ProviderID = n.Spec.ProviderID
 
 		var cpu float64

+ 10 - 10
pkg/costmodel/metrics.go

@@ -147,7 +147,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider model
 		cpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_cpu_hourly_cost",
 			Help: "node_cpu_hourly_cost hourly cost for each cpu on this node",
-		}, []string{"instance", "node", "instance_type", "region", "provider_id"})
+		}, []string{"instance", "node", "instance_type", "region", "provider_id", "arch"})
 		if _, disabled := disabledMetrics["node_cpu_hourly_cost"]; !disabled {
 			toRegisterGV = append(toRegisterGV, cpuGv)
 		}
@@ -155,7 +155,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider model
 		ramGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_ram_hourly_cost",
 			Help: "node_ram_hourly_cost hourly cost for each gb of ram on this node",
-		}, []string{"instance", "node", "instance_type", "region", "provider_id"})
+		}, []string{"instance", "node", "instance_type", "region", "provider_id", "arch"})
 		if _, disabled := disabledMetrics["node_ram_hourly_cost"]; !disabled {
 			toRegisterGV = append(toRegisterGV, ramGv)
 		}
@@ -163,7 +163,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider model
 		gpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_gpu_hourly_cost",
 			Help: "node_gpu_hourly_cost hourly cost for each gpu on this node",
-		}, []string{"instance", "node", "instance_type", "region", "provider_id"})
+		}, []string{"instance", "node", "instance_type", "region", "provider_id", "arch"})
 		if _, disabled := disabledMetrics["node_gpu_hourly_cost"]; !disabled {
 			toRegisterGV = append(toRegisterGV, gpuGv)
 		}
@@ -171,7 +171,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider model
 		gpuCountGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_gpu_count",
 			Help: "node_gpu_count count of gpu on this node",
-		}, []string{"instance", "node", "instance_type", "region", "provider_id"})
+		}, []string{"instance", "node", "instance_type", "region", "provider_id", "arch"})
 		if _, disabled := disabledMetrics["node_gpu_count"]; !disabled {
 			toRegisterGV = append(toRegisterGV, gpuCountGv)
 		}
@@ -195,7 +195,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider model
 		totalGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_total_hourly_cost",
 			Help: "node_total_hourly_cost Total node cost per hour",
-		}, []string{"instance", "node", "instance_type", "region", "provider_id"})
+		}, []string{"instance", "node", "instance_type", "region", "provider_id", "arch"})
 		if _, disabled := disabledMetrics["node_total_hourly_cost"]; !disabled {
 			toRegisterGV = append(toRegisterGV, totalGv)
 		}
@@ -528,22 +528,22 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 					nodeCostAverages[labelKey] = avgCosts
 				}
 
-				cmme.GPUCountRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpu)
-				cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
+				cmme.GPUCountRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(gpu)
+				cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(gpuCost)
 
 				const outlierFactor float64 = 30
 				// don't record cpuCost, ramCost, or gpuCost in the case of wild outliers
 				// k8s api sometimes causes cost spikes as described here:
 				// https://github.com/opencost/opencost/issues/927
 				if cpuCost < outlierFactor*avgCosts.CpuCostAverage {
-					cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
+					cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(cpuCost)
 					avgCosts.CpuCostAverage = (avgCosts.CpuCostAverage*avgCosts.NumCpuDataPoints + cpuCost) / (avgCosts.NumCpuDataPoints + 1)
 					avgCosts.NumCpuDataPoints += 1
 				} else {
 					log.Warnf("CPU cost outlier detected; skipping data point.")
 				}
 				if ramCost < outlierFactor*avgCosts.RamCostAverage {
-					cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
+					cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(ramCost)
 					avgCosts.RamCostAverage = (avgCosts.RamCostAverage*avgCosts.NumRamDataPoints + ramCost) / (avgCosts.NumRamDataPoints + 1)
 					avgCosts.NumRamDataPoints += 1
 				} else {
@@ -552,7 +552,7 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 				// skip recording totalCost if any constituent costs were outliers
 				if cpuCost < outlierFactor*avgCosts.CpuCostAverage &&
 					ramCost < outlierFactor*avgCosts.RamCostAverage {
-					cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
+					cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID, node.ArchType).Set(totalCost)
 				}
 
 				nodeCostAverages[labelKey] = avgCosts

+ 10 - 0
pkg/util/compat.go

@@ -45,3 +45,13 @@ func GetOperatingSystem(labels map[string]string) (string, bool) {
 		return "", false
 	}
 }
+
+func GetArchType(labels map[string]string) (string, bool) {
+	if _, ok := labels[v1.LabelArchStable]; ok {
+		return labels[v1.LabelArchStable], true
+	} else if _, ok := labels["beta.kubernetes.io/arch"]; ok {
+		return labels["beta.kubernetes.io/arch"], true
+	} else {
+		return "", false
+	}
+}

+ 56 - 0
pkg/util/compat_test.go

@@ -0,0 +1,56 @@
+package util
+
+import (
+	"testing"
+)
+
+func TestGetArchType(t *testing.T) {
+	type args struct {
+		labels map[string]string
+	}
+	tests := map[string]struct {
+		args  args
+		want  string
+		found bool
+	}{
+		"amd64 beta": {
+			args: args{
+				labels: map[string]string{
+					"beta.kubernetes.io/arch": "amd64",
+				},
+			},
+			want:  "amd64",
+			found: true,
+		},
+		"arm64 beta": {
+			args: args{
+				labels: map[string]string{
+					"beta.kubernetes.io/arch": "arm64",
+				},
+			},
+			want:  "arm64",
+			found: true,
+		},
+		"amd64": {
+			args: args{
+				labels: map[string]string{
+					"kubernetes.io/arch": "amd64",
+				},
+			},
+			want:  "amd64",
+			found: true,
+		},
+	}
+	for name, tt := range tests {
+		t.Run(name, func(t *testing.T) {
+			got, found := GetArchType(tt.args.labels)
+			if found != tt.found {
+				t.Errorf("GetArchType() error = %v, wantErr %v", found, tt.found)
+				return
+			}
+			if got != tt.want {
+				t.Errorf("GetArchType() got = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}