allocation_running_pods_test.go

package count

// Description: checks that the aggregate count of pods per namespace computed
// from a Prometheus query and from an Allocation API request are the same.
// Both Prometheus and /allocation appear to return duplicate results. Does
// this mean we might be double-counting costs?
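
// Test outline:
//   1. Query /allocation aggregated by pod over the window.
//   2. Query Prometheus for containers with a non-zero running average in the window.
//   3. Restrict the Prometheus pod set to pods alive at the window end.
//   4. Compare the per-namespace pod sets from both sources.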

import (
	// "fmt"
	"slices"
	"sort"
	"strings"
	"testing"
	"time"

	"github.com/opencost/opencost-integration-tests/pkg/api"
	"github.com/opencost/opencost-integration-tests/pkg/prometheus"
)

func TestQueryAllocation(t *testing.T) {
	apiObj := api.NewAPI()

	testCases := []struct {
		name       string
		window     string
		aggregate  string
		accumulate string
	}{
		{
			name:       "Yesterday",
			window:     "24h",
			aggregate:  "pod",
			accumulate: "false",
		},
	}
	t.Logf("testCases: %v", testCases)

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Allocation API client.
			apiResponse, err := apiObj.GetAllocation(api.AllocationRequest{
				Window:     tc.window,
				Aggregate:  tc.aggregate,
				Accumulate: tc.accumulate,
			})
			if err != nil {
				t.Fatalf("Error while calling Allocation API %v", err)
			}
			if apiResponse.Code != 200 {
				t.Errorf("API returned non-200 code %d", apiResponse.Code)
			}
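
			// Rounding the current time up to the next whole hour
			// presumably aligns the instant-query evaluation time with
			// the end of the window /allocation was asked about (an
			// assumption; the helper packages do not document this).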
			queryEnd := time.Now().UTC().Truncate(time.Hour).Add(time.Hour)
			endTime := queryEnd.Unix()

			// Prometheus client.
			// Want to run: avg(avg_over_time(kube_pod_container_status_running[24h]) != 0) by (container, pod, namespace)
			// Running:     avg(avg_over_time(kube_pod_container_status_running[24h])) by (container, pod, namespace)
			client := prometheus.NewClient()
			promInput := prometheus.PrometheusInput{
				Metric: "kube_pod_container_status_running",
				// MetricNotEqualTo: "0",
				Function:    []string{"avg_over_time", "avg"},
				QueryWindow: tc.window,
				AggregateBy: []string{"container", "pod", "namespace"},
				Time:        &endTime,
			}
			promResponse, err := client.RunPromQLQuery(promInput, t)
			if err != nil {
				t.Fatalf("Error while calling Prometheus API %v", err)
			}

			// Narrow the Prometheus pod set to pods alive at the query
			// endTime using a 1m-resolution subquery. Without this,
			// pods that were only very briefly running inside the 24h
			// window show up in Prometheus (as their avg_over_time is
			// non-zero) but are absent from /allocation, which only
			// reports pods with coincident usage samples. That is a
			// window-boundary race, not a pod-count bug.
			promAliveInput := prometheus.PrometheusInput{
				Metric:              "kube_pod_container_status_running",
				MetricNotEqualTo:    "0",
				Function:            []string{"avg"},
				AggregateBy:         []string{"container", "pod", "namespace", "node"},
				AggregateWindow:     tc.window,
				AggregateResolution: "1m",
				Time:                &endTime,
			}
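			// Assuming the helper renders this input roughly as
			//   avg(kube_pod_container_status_running[24h:1m])
			//       by (container, pod, namespace, node) != 0
			// i.e. a 1m-resolution subquery with zero-valued series
			// filtered out (the exact rendering is an assumption about
			// pkg/prometheus, not confirmed here).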
			promAliveResponse, err := client.RunPromQLQuery(promAliveInput, t)
			if err != nil {
				t.Fatalf("Error while calling Prometheus API %v", err)
			}
			alivePods := make(map[string]bool)
			for _, metric := range promAliveResponse.Data.Result {
				alivePods[metric.Metric.Pod] = true
			}
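			// Note: alivePods is keyed by pod name alone, so identically
			// named pods in different namespaces are conflated; keying by
			// namespace/pod would be stricter.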

			// Calculate the number of pods per aggregate (namespace) for each source.
			type podAggregation struct {
				Pods []string
			}
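			// apiResponse.Data[0] appears to hold the single queried
			// window; with aggregate=pod its keys are pod names (an
			// assumption about the response shape, inferred from usage).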
			// Namespace-based calculation from the Allocation API response.
			apiAggregateCount := make(map[string]*podAggregation)
			for pod, allocationResponseItem := range apiResponse.Data[0] {
				// Synthetic items generated by /allocation that have no
				// Prometheus counterpart.
				if slices.Contains([]string{"prometheus-system-unmounted-pvcs", "network-load-gen-unmounted-pvcs"}, pod) {
					continue
				}
				// Skip items without a pod property before they can
				// create a namespace entry.
				if allocationResponseItem.Properties.Pod == "" {
					continue
				}
				podNamespace := allocationResponseItem.Properties.Namespace
				apiAggregateItem, namespacePresent := apiAggregateCount[podNamespace]
				if !namespacePresent {
					apiAggregateCount[podNamespace] = &podAggregation{
						Pods: []string{pod},
					}
					continue
				}
				if !slices.Contains(apiAggregateItem.Pods, pod) {
					apiAggregateItem.Pods = append(apiAggregateItem.Pods, pod)
				}
			}

			// Namespace-based calculation from the Prometheus response.
			promAggregateCount := make(map[string]*podAggregation)
			for _, metric := range promResponse.Data.Result {
				podNamespace := metric.Metric.Namespace
				pod := metric.Metric.Pod
				// This pod was down for the entire window; the != 0
				// filter could not be expressed in the query above, so
				// filter here instead.
				if metric.Value.Value == 0 {
					continue
				}
				// Skip pods that are not alive at the query end time.
				// /allocation only returns pods with usage data in the
				// window, so short-lived pods that were up earlier in
				// the 24h window but not at endTime would otherwise
				// produce spurious mismatches.
				if !alivePods[pod] {
					continue
				}
				promAggregateItem, namespacePresent := promAggregateCount[podNamespace]
				if !namespacePresent {
					promAggregateCount[podNamespace] = &podAggregation{
						Pods: []string{pod},
					}
					continue
				}
				if !slices.Contains(promAggregateItem.Pods, pod) {
					promAggregateItem.Pods = append(promAggregateItem.Pods, pod)
				}
			}

			if len(promAggregateCount) != len(apiAggregateCount) {
				t.Logf("Namespace Count Allocation %d != Prometheus %d", len(apiAggregateCount), len(promAggregateCount))
			}
			// Compare the union of namespaces so that a namespace present
			// in only one source is reported; ranging over just
			// promAggregateCount would silently skip namespaces returned
			// only by the Allocation API.
			allNamespaces := make(map[string]bool)
			for namespace := range promAggregateCount {
				allNamespaces[namespace] = true
			}
			for namespace := range apiAggregateCount {
				allNamespaces[namespace] = true
			}
			for namespace := range allNamespaces {
				apiNamespaceCount, apiNamespacePresent := apiAggregateCount[namespace]
				promNamespaceCount, promNamespacePresent := promAggregateCount[namespace]
				if apiNamespacePresent && promNamespacePresent {
					t.Logf("Namespace: %s", namespace)
					sort.Strings(apiNamespaceCount.Pods)
					sort.Strings(promNamespaceCount.Pods)
					if len(apiNamespaceCount.Pods) != len(promNamespaceCount.Pods) {
						t.Errorf("[Fail]: /allocation (%d) != Prometheus (%d)", len(apiNamespaceCount.Pods), len(promNamespaceCount.Pods))
						t.Errorf("API Pods:\n - %v\nPrometheus Pods:\n - %v", strings.Join(apiNamespaceCount.Pods, "\n - "), strings.Join(promNamespaceCount.Pods, "\n - "))
					} else {
						t.Logf("[Pass]: Pod Count %d", len(apiNamespaceCount.Pods))
					}
				} else {
					t.Errorf("Namespace %s missing: allocation API present=%v, Prometheus present=%v", namespace, apiNamespacePresent, promNamespacePresent)
				}
			}
		})
	}
}