Explorar el Código

Refactor ComputeCostData and add unit tests (#3295)

Signed-off-by: sneax <paladesh600@gmail.com>
Signed-off-by: segfault_bits <149077711+sneaxhuh@users.noreply.github.com>
Co-authored-by: Matt Bolt <mbolt35@gmail.com>
segfault_bits hace 7 meses
padre
commit
3dfebff673
Se han modificado 2 ficheros con 164 adiciones y 152 borrados
  1. 50 37
      pkg/costmodel/costmodel.go
  2. 114 115
      pkg/costmodel/costmodel_test.go

+ 50 - 37
pkg/costmodel/costmodel.go

@@ -139,14 +139,7 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 	ds := cm.DataSource
 	mq := ds.Metrics()
 
-	grp := source.NewQueryGroup()
-
-	resChRAMUsage := source.WithGroup(grp, mq.QueryRAMUsageAvg(start, end))
-	resChCPUUsage := source.WithGroup(grp, mq.QueryCPUUsageAvg(start, end))
-	resChNetZoneRequests := source.WithGroup(grp, mq.QueryNetZoneGiB(start, end))
-	resChNetRegionRequests := source.WithGroup(grp, mq.QueryNetRegionGiB(start, end))
-	resChNetInternetRequests := source.WithGroup(grp, mq.QueryNetInternetGiB(start, end))
-
+	// Get Kubernetes data
 	// Pull pod information from k8s API
 	podlist := cm.Cache.GetAllPods()
 
@@ -170,30 +163,10 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 		return nil, err
 	}
 
-	// Process Prometheus query results. Handle errors using ctx.Errors.
-	resRAMUsage, _ := resChRAMUsage.Await()
-	resCPUUsage, _ := resChCPUUsage.Await()
-	resNetZoneRequests, _ := resChNetZoneRequests.Await()
-	resNetRegionRequests, _ := resChNetRegionRequests.Await()
-	resNetInternetRequests, _ := resChNetInternetRequests.Await()
-
-	// NOTE: The way we currently handle errors and warnings only early returns if there is an error. Warnings
-	// NOTE: will not propagate unless coupled with errors.
-	if grp.HasErrors() {
-		// To keep the context of where the errors are occurring, we log the errors here and pass them the error
-		// back to the caller. The caller should handle the specific case where error is an ErrorCollection
-		for _, queryErr := range grp.Errors() {
-			if queryErr.Error != nil {
-				log.Errorf("ComputeCostData: Request Error: %s", queryErr.Error)
-			}
-			if queryErr.ParseError != nil {
-				log.Errorf("ComputeCostData: Parsing Error: %s", queryErr.ParseError)
-			}
-		}
-
-		// ErrorCollection is an collection of errors wrapped in a single error implementation
-		// We opt to not return an error for the sake of running as a pure exporter.
-		log.Warnf("ComputeCostData: continuing despite prometheus errors: %s", grp.Error())
+	// Get metrics data
+	resRAMUsage, resCPUUsage, resNetZoneRequests, resNetRegionRequests, resNetInternetRequests, err := queryMetrics(mq, start, end)
+	if err != nil {
+		log.Warnf("ComputeCostData: continuing despite metrics errors: %s", err)
 	}
 
 	defer measureTime(time.Now(), profileThreshold, "ComputeCostData: Processing Query Data")
@@ -254,7 +227,7 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 			return nil, err
 		}
 		for _, c := range cs {
-			containers[c.Key()] = true // captures any containers that existed for a time < a prometheus scrape interval. We currently charge 0 for this but should charge something.
+			containers[c.Key()] = true // captures any containers that existed for a time < a metrics scrape interval. We currently charge 0 for this but should charge something.
 			currentContainers[c.Key()] = *pod
 		}
 	}
@@ -268,6 +241,7 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 		// deleted so we have usage information but not request information. In that case,
 		// we return partial data for CPU and RAM: only usage and not requests.
 		if pod, ok := currentContainers[key]; ok {
+
 			podName := pod.Name
 			ns := pod.Namespace
 
@@ -361,7 +335,7 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 				ramRequestBytes := container.Resources.Requests.Memory().Value()
 
 				// Because information on container RAM & CPU requests isn't
-				// coming from Prometheus, it won't have a timestamp associated
+				// coming from metrics, it won't have a timestamp associated
 				// with it. We need to provide a timestamp.
 				RAMReqV := []*util.Vector{
 					{
@@ -461,7 +435,7 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 				containerNameCost[newKey] = costs
 			}
 		} else {
-			// The container has been deleted. Not all information is sent to prometheus via ksm, so fill out what we can without k8s api
+			// The container has been deleted. Not all information is sent to metrics via ksm, so fill out what we can without k8s api
 			log.Debug("The container " + key + " has been deleted. Calculating allocation but resulting object will be missing data.")
 			c, err := NewContainerMetricFromKey(key)
 			if err != nil {
@@ -543,6 +517,7 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 			missingContainers[key] = costs
 		}
 	}
+
 	// Use unmounted pvs to create a mapping of "Unmounted-<Namespace>" containers
 	// to pass along the cost data
 	unmounted := findUnmountedPVCostData(cm.ClusterMap, unmountedPVs, namespaceLabelsMapping, namespaceAnnotationsMapping)
@@ -564,6 +539,44 @@ func (cm *CostModel) ComputeCostData(start, end time.Time) (map[string]*CostData
 	return containerNameCost, err
 }
 
+func queryMetrics(mq source.MetricsQuerier, start, end time.Time) ([]*source.ContainerMetricResult, []*source.ContainerMetricResult, []*source.NetZoneGiBResult, []*source.NetRegionGiBResult, []*source.NetInternetGiBResult, error) {
+	grp := source.NewQueryGroup()
+
+	resChRAMUsage := source.WithGroup(grp, mq.QueryRAMUsageAvg(start, end))
+	resChCPUUsage := source.WithGroup(grp, mq.QueryCPUUsageAvg(start, end))
+	resChNetZoneRequests := source.WithGroup(grp, mq.QueryNetZoneGiB(start, end))
+	resChNetRegionRequests := source.WithGroup(grp, mq.QueryNetRegionGiB(start, end))
+	resChNetInternetRequests := source.WithGroup(grp, mq.QueryNetInternetGiB(start, end))
+
+	// Process metrics query results. Handle errors using ctx.Errors.
+	resRAMUsage, _ := resChRAMUsage.Await()
+	resCPUUsage, _ := resChCPUUsage.Await()
+	resNetZoneRequests, _ := resChNetZoneRequests.Await()
+	resNetRegionRequests, _ := resChNetRegionRequests.Await()
+	resNetInternetRequests, _ := resChNetInternetRequests.Await()
+
+	// NOTE: The way we currently handle errors and warnings only early returns if there is an error. Warnings
+	// NOTE: will not propagate unless coupled with errors.
+	if grp.HasErrors() {
+		// To keep the context of where the errors are occurring, we log the errors here and pass them the error
+		// back to the caller. The caller should handle the specific case where error is an ErrorCollection
+		for _, queryErr := range grp.Errors() {
+			if queryErr.Error != nil {
+				log.Errorf("ComputeCostData: Request Error: %s", queryErr.Error)
+			}
+			if queryErr.ParseError != nil {
+				log.Errorf("ComputeCostData: Parsing Error: %s", queryErr.ParseError)
+			}
+		}
+
+		// ErrorCollection is an collection of errors wrapped in a single error implementation
+		// We opt to not return an error for the sake of running as a pure exporter.
+		return resRAMUsage, resCPUUsage, resNetZoneRequests, resNetRegionRequests, resNetInternetRequests, grp.Error()
+	}
+
+	return resRAMUsage, resCPUUsage, resNetZoneRequests, resNetRegionRequests, resNetInternetRequests, nil
+}
+
 func findUnmountedPVCostData(clusterMap clusters.ClusterMap, unmountedPVs map[string][]*PersistentVolumeClaimData, namespaceLabelsMapping map[string]map[string]string, namespaceAnnotationsMapping map[string]map[string]string) map[string]*CostData {
 	costs := make(map[string]*CostData)
 	if len(unmountedPVs) == 0 {
@@ -673,14 +686,14 @@ func findDeletedNodeInfo(dataSource source.OpenCostDataSource, missingNodes map[
 		}
 
 		if len(cpuCosts) == 0 {
-			log.Infof("Kubecost prometheus metrics not currently available. Ingest this server's /metrics endpoint to get that data.")
+			log.Infof("Opencost metrics not currently available. Ingest this server's /metrics endpoint to get that data.")
 		}
 
 		for node, costv := range cpuCosts {
 			if _, ok := missingNodes[node]; ok {
 				missingNodes[node].VCPUCost = fmt.Sprintf("%f", costv[0].Value)
 			} else {
-				log.DedupedWarningf(5, "Node `%s` in prometheus but not k8s api", node)
+				log.DedupedWarningf(5, "Node `%s` in metrics but not k8s api", node)
 			}
 		}
 		for node, costv := range ramCosts {

+ 114 - 115
pkg/costmodel/costmodel_test.go

@@ -1,10 +1,12 @@
 package costmodel
 
 import (
+	"math"
 	"math/rand"
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/core/pkg/util"
 	"github.com/stretchr/testify/assert"
@@ -201,149 +203,146 @@ func Test_CostData_GetController_CronJob(t *testing.T) {
 	}
 }
 
-func Test_getContainerAllocation(t *testing.T) {
+func TestGetContainerAllocation(t *testing.T) {
 	cases := []struct {
-		name string
-		cd   CostData
-
-		expectedCPUAllocation []*util.Vector
-		expectedRAMAllocation []*util.Vector
+		name           string
+		req            *util.Vector
+		used           *util.Vector
+		allocationType string
+		expected       []*util.Vector
 	}{
 		{
-			name: "Requests greater than usage",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
-				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			name: "request > usage",
+			req: &util.Vector{
+				Value:     100,
+				Timestamp: 1672531200,
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
-		},
-		{
-			name: "Requests less than usage",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-				CPUUsed: []*util.Vector{{Value: 2.2, Timestamp: 1686929350}},
-				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
-				RAMUsed: []*util.Vector{{Value: 75000000, Timestamp: 1686929350}},
+			used: &util.Vector{
+				Value:     50,
+				Timestamp: 1672531200,
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 2.2, Timestamp: 1686929350}},
-			expectedRAMAllocation: []*util.Vector{{Value: 75000000, Timestamp: 1686929350}},
-		},
-		{
-			// Expected behavior for getContainerAllocation is to always use the
-			// highest Timestamp value. The significance of 10 seconds comes
-			// from the current default in ApplyVectorOp() in
-			// pkg/util/vector.go.
-			name: "Mismatched timestamps less than 10 seconds apart",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929354}},
-				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929354}},
-				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			allocationType: "RAM",
+			expected: []*util.Vector{
+				{
+					Value:     100,
+					Timestamp: 1672531200,
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929354}},
-			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929354}},
 		},
 		{
-			// Expected behavior for getContainerAllocation is to always use the
-			// hightest Timestamp value. The significance of 10 seconds comes
-			// from the current default in ApplyVectorOp() in
-			// pkg/util/vector.go.
-			name: "Mismatched timestamps greater than 10 seconds apart",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929399}},
-				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929399}},
-				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			name: "usage > request",
+			req: &util.Vector{
+				Value:     50,
+				Timestamp: 1672531200,
+			},
+			used: &util.Vector{
+				Value:     100,
+				Timestamp: 1672531200,
+			},
+			allocationType: "RAM",
+			expected: []*util.Vector{
+				{
+					Value:     100,
+					Timestamp: 1672531200,
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929399}},
-			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929399}},
 		},
 		{
-			name: "Requests has no values",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
-				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-				RAMReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
-				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			name: "only request is non-nil",
+			req: &util.Vector{
+				Value:     100,
+				Timestamp: 1672531200,
+			},
+			used:           nil,
+			allocationType: "CPU",
+			expected: []*util.Vector{
+				{
+					Value:     100,
+					Timestamp: 1672531200,
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-			expectedRAMAllocation: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
 		},
 		{
-			name: "Usage has no values",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-				CPUUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
-				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
-				RAMUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
+			name: "only used is non-nil",
+			req:  nil,
+			used: &util.Vector{
+				Value:     100,
+				Timestamp: 1672531200,
+			},
+			allocationType: "CPU",
+			expected: []*util.Vector{
+				{
+					Value:     100,
+					Timestamp: 1672531200,
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
 		},
 		{
-			// WRN Log should be thrown
-			name: "Both have no values",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
-				CPUUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
-				RAMReq:  []*util.Vector{{Value: 0, Timestamp: 0}},
-				RAMUsed: []*util.Vector{{Value: 0, Timestamp: 0}},
+			name:           "both req and used are nil",
+			req:            nil,
+			used:           nil,
+			allocationType: "GPU",
+			expected: []*util.Vector{
+				{
+					Value:     0,
+					Timestamp: float64(time.Now().UTC().Unix()),
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 0, Timestamp: 0}},
-			expectedRAMAllocation: []*util.Vector{{Value: 0, Timestamp: 0}},
 		},
 		{
-			name: "Requests is Nil",
-			cd: CostData{
-				CPUReq:  []*util.Vector{nil},
-				CPUUsed: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-				RAMReq:  []*util.Vector{nil},
-				RAMUsed: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
+			name: "NaN in request value",
+			req: &util.Vector{
+				Value:     math.NaN(),
+				Timestamp: 1672531200,
+			},
+			used: &util.Vector{
+				Value:     50,
+				Timestamp: 1672531200,
+			},
+			allocationType: "RAM",
+			expected: []*util.Vector{
+				{
+					Value:     50,
+					Timestamp: 1672531200,
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: .01, Timestamp: 1686929350}},
-			expectedRAMAllocation: []*util.Vector{{Value: 5500000, Timestamp: 1686929350}},
 		},
 		{
-			name: "Usage is nil",
-			cd: CostData{
-				CPUReq:  []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-				CPUUsed: []*util.Vector{nil},
-				RAMReq:  []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
-				RAMUsed: []*util.Vector{nil},
+			name: "NaN in used value",
+			req: &util.Vector{
+				Value:     100,
+				Timestamp: 1672531200,
+			},
+			used: &util.Vector{
+				Value:     math.NaN(),
+				Timestamp: 1672531200,
+			},
+			allocationType: "CPU",
+			expected: []*util.Vector{
+				{
+					Value:     100,
+					Timestamp: 1672531200,
+				},
 			},
-
-			expectedCPUAllocation: []*util.Vector{{Value: 1.0, Timestamp: 1686929350}},
-			expectedRAMAllocation: []*util.Vector{{Value: 10000000, Timestamp: 1686929350}},
 		},
 	}
 
-	for _, c := range cases {
-		t.Run(c.name, func(t *testing.T) {
-			cpuAllocation := getContainerAllocation(c.cd.CPUReq[0], c.cd.CPUUsed[0], "CPU")
-			ramAllocation := getContainerAllocation(c.cd.RAMReq[0], c.cd.RAMUsed[0], "RAM")
-
-			if cpuAllocation[0].Value != c.expectedCPUAllocation[0].Value {
-				t.Errorf("CPU Allocation mismatch. Expected Value: %f. Got: %f", cpuAllocation[0].Value, c.expectedCPUAllocation[0].Value)
-			}
-			if cpuAllocation[0].Timestamp != c.expectedCPUAllocation[0].Timestamp {
-				t.Errorf("CPU Allocation mismatch. Expected Timestamp: %f. Got: %f", cpuAllocation[0].Timestamp, c.expectedCPUAllocation[0].Timestamp)
-			}
-			if ramAllocation[0].Value != c.expectedRAMAllocation[0].Value {
-				t.Errorf("RAM Allocation mismatch. Expected Value: %f. Got: %f", ramAllocation[0].Value, c.expectedRAMAllocation[0].Value)
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// For the nil case, the timestamp is dynamic, so we need to handle it separately
+			if tc.name == "both req and used are nil" {
+				result := getContainerAllocation(tc.req, tc.used, tc.allocationType)
+				if result[0].Value != 0 {
+					t.Errorf("Expected value to be 0, but got %f", result[0].Value)
+				}
+				if time.Now().UTC().Unix()-int64(result[0].Timestamp) > 5 {
+					t.Errorf("Expected timestamp to be recent, but it was not")
+				}
+				return
 			}
-			if ramAllocation[0].Timestamp != c.expectedRAMAllocation[0].Timestamp {
-				t.Errorf("RAM Allocation mismatch. Expected Timestamp: %f. Got: %f", ramAllocation[0].Timestamp, c.expectedRAMAllocation[0].Timestamp)
+			result := getContainerAllocation(tc.req, tc.used, tc.allocationType)
+			if diff := cmp.Diff(tc.expected, result); diff != "" {
+				t.Errorf("getContainerAllocation() mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}