Просмотр исходного кода

Fix bug in Allocation metrics (#1992)

* Print CPU requests, CPU used, CPU Allocation

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Update getContainerAllocation() function

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Cleanup printing & comments

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Clean spacing & comments

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Adjust signature of getContainerAllocation()

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Adjust logic for Timestamps. Add unit tests.

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Always call getContainerAllocation regardless of whether Req or Used slices have elements

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

---------

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>
Thomas Nguyen 2 лет назад
Родитель
Сommit
302e55baa4
2 измененных файлов с 244 добавлено и 32 удалено
  1. 94 32
      pkg/costmodel/costmodel.go
  2. 150 0
      pkg/costmodel/costmodel_test.go

+ 94 - 32
pkg/costmodel/costmodel.go

@@ -500,11 +500,9 @@ func (cm *CostModel) ComputeCostData(cli prometheusClient.Client, cp costAnalyze
 				// for the units of memory and CPU.
 				ramRequestBytes := container.Resources.Requests.Memory().Value()
 
-				// Because RAM (and CPU) information isn't coming from Prometheus, it won't
-				// have a timestamp associated with it. We need to provide a timestamp,
-				// otherwise the vector op that gets applied to take the max of usage
-				// and request won't work properly and will only take into account
-				// usage.
+				// Because information on container RAM & CPU requests isn't
+				// coming from Prometheus, it won't have a timestamp associated
+				// with it. We need to provide a timestamp.
 				RAMReqV := []*util.Vector{
 					{
 						Value:     float64(ramRequestBytes),
@@ -582,8 +580,25 @@ func (cm *CostModel) ComputeCostData(cli prometheusClient.Client, cp costAnalyze
 					ClusterID:       clusterID,
 					ClusterName:     cm.ClusterMap.NameFor(clusterID),
 				}
-				costs.CPUAllocation = getContainerAllocation(costs.CPUReq, costs.CPUUsed, "CPU")
-				costs.RAMAllocation = getContainerAllocation(costs.RAMReq, costs.RAMUsed, "RAM")
+
+				var cpuReq, cpuUse *util.Vector
+				if len(costs.CPUReq) > 0 {
+					cpuReq = costs.CPUReq[0]
+				}
+				if len(costs.CPUUsed) > 0 {
+					cpuUse = costs.CPUUsed[0]
+				}
+				costs.CPUAllocation = getContainerAllocation(cpuReq, cpuUse, "CPU")
+
+				var ramReq, ramUse *util.Vector
+				if len(costs.RAMReq) > 0 {
+					ramReq = costs.RAMReq[0]
+				}
+				if len(costs.RAMUsed) > 0 {
+					ramUse = costs.RAMUsed[0]
+				}
+				costs.RAMAllocation = getContainerAllocation(ramReq, ramUse, "RAM")
+
 				if filterNamespace == "" {
 					containerNameCost[newKey] = costs
 				} else if costs.Namespace == filterNamespace {
@@ -650,8 +665,25 @@ func (cm *CostModel) ComputeCostData(cli prometheusClient.Client, cp costAnalyze
 				ClusterID:       c.ClusterID,
 				ClusterName:     cm.ClusterMap.NameFor(c.ClusterID),
 			}
-			costs.CPUAllocation = getContainerAllocation(costs.CPUReq, costs.CPUUsed, "CPU")
-			costs.RAMAllocation = getContainerAllocation(costs.RAMReq, costs.RAMUsed, "RAM")
+
+			var cpuReq, cpuUse *util.Vector
+			if len(costs.CPUReq) > 0 {
+				cpuReq = costs.CPUReq[0]
+			}
+			if len(costs.CPUUsed) > 0 {
+				cpuUse = costs.CPUUsed[0]
+			}
+			costs.CPUAllocation = getContainerAllocation(cpuReq, cpuUse, "CPU")
+
+			var ramReq, ramUse *util.Vector
+			if len(costs.RAMReq) > 0 {
+				ramReq = costs.RAMReq[0]
+			}
+			if len(costs.RAMUsed) > 0 {
+				ramUse = costs.RAMUsed[0]
+			}
+			costs.RAMAllocation = getContainerAllocation(ramReq, ramUse, "RAM")
+
 			if filterNamespace == "" {
 				containerNameCost[key] = costs
 				missingContainers[key] = costs
@@ -828,32 +860,62 @@ func findDeletedNodeInfo(cli prometheusClient.Client, missingNodes map[string]*c
 	return nil
 }
 
-func getContainerAllocation(req []*util.Vector, used []*util.Vector, allocationType string) []*util.Vector {
-	// The result of the normalize operation will be a new []*util.Vector to replace the requests
-	allocationOp := func(r *util.Vector, x *float64, y *float64) bool {
-		if x != nil && y != nil {
-			x1 := *x
-			if math.IsNaN(x1) {
-				log.Warnf("NaN value found during %s allocation calculation for requests.", allocationType)
-				x1 = 0.0
-			}
-			y1 := *y
-			if math.IsNaN(y1) {
-				log.Warnf("NaN value found during %s allocation calculation for used.", allocationType)
-				y1 = 0.0
-			}
-
-			r.Value = math.Max(x1, y1)
-		} else if x != nil {
-			r.Value = *x
-		} else if y != nil {
-			r.Value = *y
+// getContainerAllocation takes the max between request and usage. This function
+// returns a slice containing a single element describing the container's
+// allocation.
+//
+// Additionally, the timestamp of the allocation will be the highest value
+// timestamp between the two vectors. This mitigates situations where
+// Timestamp=0. This should have no effect on the metrics emitted by the
+// CostModelMetricsEmitter
+func getContainerAllocation(req *util.Vector, used *util.Vector, allocationType string) []*util.Vector {
+	var result []*util.Vector
+
+	if req != nil && used != nil {
+		x1 := req.Value
+		if math.IsNaN(x1) {
+			log.Warnf("NaN value found during %s allocation calculation for requests.", allocationType)
+			x1 = 0.0
+		}
+		y1 := used.Value
+		if math.IsNaN(y1) {
+			log.Warnf("NaN value found during %s allocation calculation for used.", allocationType)
+			y1 = 0.0
+		}
+		result = []*util.Vector{
+			{
+				Value:     math.Max(x1, y1),
+				Timestamp: math.Max(req.Timestamp, used.Timestamp),
+			},
+		}
+		if result[0].Value == 0 && result[0].Timestamp == 0 {
+			log.Warnf("No request or usage data found during %s allocation calculation. Setting allocation to 0.", allocationType)
+		}
+	} else if req != nil {
+		result = []*util.Vector{
+			{
+				Value:     req.Value,
+				Timestamp: req.Timestamp,
+			},
+		}
+	} else if used != nil {
+		result = []*util.Vector{
+			{
+				Value:     used.Value,
+				Timestamp: used.Timestamp,
+			},
+		}
+	} else {
+		log.Warnf("No request or usage data found during %s allocation calculation. Setting allocation to 0.", allocationType)
+		result = []*util.Vector{
+			{
+				Value:     0,
+				Timestamp: float64(time.Now().UTC().Unix()),
+			},
 		}
-
-		return true
 	}
 
-	return util.ApplyVectorOp(req, used, allocationOp)
+	return result
 }
 
 func addPVData(cache clustercache.ClusterCache, pvClaimMapping map[string]*PersistentVolumeClaimData, cloud costAnalyzerCloud.Provider) error {

+ 150 - 0
pkg/costmodel/costmodel_test.go

@@ -2,6 +2,8 @@ package costmodel
 
 import (
 	"testing"
+
+	"github.com/opencost/opencost/pkg/util"
 )
 
 func Test_CostData_GetController_CronJob(t *testing.T) {
@@ -65,3 +67,151 @@ func Test_CostData_GetController_CronJob(t *testing.T) {
 		})
 	}
 }
+
+func Test_getContainerAllocation(t *testing.T) {
+	cases := []struct {
+		name string
+		cd   CostData
+
+		expectedCPUAllocation []*util.Vector
+		expectedRAMAllocation []*util.Vector
+	}{
+		{
+			name: "Requests greater than usage",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+		},
+		{
+			name: "Requests less than usage",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+				CPUUsed: []*util.Vector{{Value: 2.2, Timestamp: 1686929350}},
+				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+				RAMUsed: []*util.Vector{{Value: 75000000, Timestamp: 1686929350}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 2.2, Timestamp: 1686929350}},
+			expectedRAMAllocation: []*util.Vector{{Value: 75000000, Timestamp: 1686929350}},
+		},
+		{
+			// Expected behavior for getContainerAllocation is to always use the
+			// highest Timestamp value. The significance of 10 seconds comes
+			// from the current default in ApplyVectorOp() in
+			// pkg/util/vector.go.
+			name: "Mismatched timestamps less than 10 seconds apart",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929354}},
+				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929354}},
+				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929354}},
+			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929354}},
+		},
+		{
+			// Expected behavior for getContainerAllocation is to always use the
+			// hightest Timestamp value. The significance of 10 seconds comes
+			// from the current default in ApplyVectorOp() in
+			// pkg/util/vector.go.
+			name: "Mismatched timestamps greater than 10 seconds apart",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929399}},
+				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929399}},
+				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929399}},
+			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929399}},
+		},
+		{
+			name: "Requests has no values",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
+				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+				RAMReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
+				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+			expectedRAMAllocation: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+		},
+		{
+			name: "Usage has no values",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+				CPUUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
+				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+				RAMUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+		},
+		{
+			// WRN Log should be thrown
+			name: "Both have no values",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
+				CPUUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
+				RAMReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
+				RAMUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 0, Timestamp: 0}},
+			expectedRAMAllocation: []*util.Vector{{Value: 0, Timestamp: 0}},
+		},
+		{
+			name: "Requests is Nil",
+			cd: CostData{
+				CPUReq:  []*util.Vector{nil},
+				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+				RAMReq:  []*util.Vector{nil},
+				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
+			expectedRAMAllocation: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+		},
+		{
+			name: "Usage is nil",
+			cd: CostData{
+				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+				CPUUsed: []*util.Vector{nil},
+				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+				RAMUsed: []*util.Vector{nil},
+			},
+
+			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
+			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			cpuAllocation := getContainerAllocation(c.cd.CPUReq[0], c.cd.CPUUsed[0], "CPU")
+			ramAllocation := getContainerAllocation(c.cd.RAMReq[0], c.cd.RAMUsed[0], "RAM")
+
+			if cpuAllocation[0].Value != c.expectedCPUAllocation[0].Value {
+				t.Errorf("CPU Allocation mismatch. Expected Value: %f. Got: %f", cpuAllocation[0].Value, c.expectedCPUAllocation[0].Value)
+			}
+			if cpuAllocation[0].Timestamp != c.expectedCPUAllocation[0].Timestamp {
+				t.Errorf("CPU Allocation mismatch. Expected Timestamp: %f. Got: %f", cpuAllocation[0].Timestamp, c.expectedCPUAllocation[0].Timestamp)
+			}
+			if ramAllocation[0].Value != c.expectedRAMAllocation[0].Value {
+				t.Errorf("RAM Allocation mismatch. Expected Value: %f. Got: %f", ramAllocation[0].Value, c.expectedRAMAllocation[0].Value)
+			}
+			if ramAllocation[0].Timestamp != c.expectedRAMAllocation[0].Timestamp {
+				t.Errorf("RAM Allocation mismatch. Expected Timestamp: %f. Got: %f", ramAllocation[0].Timestamp, c.expectedRAMAllocation[0].Timestamp)
+			}
+		})
+	}
+}