| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184 |
- package count
// Description: checks that the per-namespace aggregate pod counts obtained
// from a Prometheus query and from an allocation API request are the same.
// Both Prometheus and the allocation API appear to return duplicate results;
// if so, we might be double counting costs.
- import (
- // "fmt"
- "slices"
- "sort"
- "strings"
- "testing"
- "time"
- "github.com/opencost/opencost-integration-tests/pkg/api"
- "github.com/opencost/opencost-integration-tests/pkg/prometheus"
- )
- func TestQueryAllocation(t *testing.T) {
- apiObj := api.NewAPI()
- testCases := []struct {
- name string
- window string
- aggregate string
- accumulate string
- }{
- {
- name: "Yesterday",
- window: "24h",
- aggregate: "pod",
- accumulate: "false",
- },
- }
- t.Logf("testCases: %v", testCases)
- for _, tc := range testCases {
- t.Run(tc.name, func(t *testing.T) {
- // API Client
- apiResponse, err := apiObj.GetAllocation(api.AllocationRequest{
- Window: tc.window,
- Aggregate: tc.aggregate,
- Accumulate: tc.accumulate,
- })
- if err != nil {
- t.Fatalf("Error while calling Allocation API %v", err)
- }
- if apiResponse.Code != 200 {
- t.Errorf("API returned non-200 code")
- }
- queryEnd := time.Now().UTC().Truncate(time.Hour).Add(time.Hour)
- endTime := queryEnd.Unix()
- // Prometheus Client
- // Want to Run avg(avg_over_time(kube_pod_container_status_running[24h]) != 0) by (container, pod, namespace)
- // Running avg(avg_over_time(kube_pod_container_status_running[24h])) by (container, pod, namespace)
- client := prometheus.NewClient()
- promInput := prometheus.PrometheusInput{
- Metric: "kube_pod_container_status_running",
- // MetricNotEqualTo: "0",
- Function: []string{"avg_over_time", "avg"},
- QueryWindow: tc.window,
- AggregateBy: []string{"container", "pod", "namespace"},
- Time: &endTime,
- }
- promResponse, err := client.RunPromQLQuery(promInput, t)
- if err != nil {
- t.Fatalf("Error while calling Prometheus API %v", err)
- }
- // Narrow the Prometheus pod set to pods alive at the query
- // endTime using a 1m-resolution subquery. Without this,
- // pods that were only very briefly running inside the 24h
- // window show up in Prometheus (as their avg_over_time is
- // non-zero) but are absent from /allocation, which only
- // reports pods with coincident usage samples. That is a
- // window-boundary race, not a pod-count bug.
- promAliveInput := prometheus.PrometheusInput{
- Metric: "kube_pod_container_status_running",
- MetricNotEqualTo: "0",
- Function: []string{"avg"},
- AggregateBy: []string{"container", "pod", "namespace", "node"},
- AggregateWindow: tc.window,
- AggregateResolution: "1m",
- Time: &endTime,
- }
- promAliveResponse, err := client.RunPromQLQuery(promAliveInput, t)
- if err != nil {
- t.Fatalf("Error while calling Prometheus API %v", err)
- }
- alivePods := make(map[string]bool)
- for _, metric := range promAliveResponse.Data.Result {
- alivePods[metric.Metric.Pod] = true
- }
- // Calculate Number of Pods per Aggregate for API Object
- type podAggregation struct {
- Pods []string
- }
- // Namespace based calculation
- var apiAggregateCount = make(map[string]*podAggregation)
- for pod, allocationResponeItem := range apiResponse.Data[0] {
- // Synthetic value generated and returned by /allocation and not /prometheus
- if slices.Contains([]string{"prometheus-system-unmounted-pvcs", "network-load-gen-unmounted-pvcs"}, pod) {
- continue
- }
- podNamespace := allocationResponeItem.Properties.Namespace
- apiAggregateItem, namespacePresent := apiAggregateCount[podNamespace]
- if !namespacePresent {
- apiAggregateCount[podNamespace] = &podAggregation{
- Pods: []string{pod},
- }
- continue
- }
- if allocationResponeItem.Properties.Pod == "" {
- continue
- }
- if !slices.Contains(apiAggregateItem.Pods, pod) {
- apiAggregateItem.Pods = append(apiAggregateItem.Pods, pod)
- }
- }
- // Calculate Number of Pods per Aggregate for Prom Object
- var promAggregateCount = make(map[string]*podAggregation)
- for _, metric := range promResponse.Data.Result {
- podNamespace := metric.Metric.Namespace
- pod := metric.Metric.Pod
- // This pod was down, unable to do it with the query
- if metric.Value.Value == 0 {
- continue
- }
- // Skip pods that are not alive at the query end time.
- // /allocation only returns pods with usage data in the
- // window, so short-lived pods that were up earlier in
- // the 24h window but not at endTime would otherwise
- // produce spurious mismatches.
- if !alivePods[pod] {
- continue
- }
- promAggregateItem, namespacePresent := promAggregateCount[podNamespace]
- if !namespacePresent {
- promAggregateCount[podNamespace] = &podAggregation{
- Pods: []string{pod},
- }
- continue
- }
- if !slices.Contains(promAggregateItem.Pods, pod) {
- promAggregateItem.Pods = append(promAggregateItem.Pods, pod)
- }
- }
- if len(promAggregateCount) != len(apiAggregateCount) {
- t.Logf("Namespace Count Allocation %d != Prometheus %d", len(apiAggregateCount), len(promAggregateCount))
- }
- for namespace, _ := range promAggregateCount {
- apiNamespaceCount, apiNamespacePresent := apiAggregateCount[namespace]
- promNamespaceCount, promNamespacePresent := promAggregateCount[namespace]
- if apiNamespacePresent && promNamespacePresent {
- t.Logf("Namespace: %s", namespace)
- sort.Strings(apiNamespaceCount.Pods)
- sort.Strings(promNamespaceCount.Pods)
- if len(apiNamespaceCount.Pods) != len(promNamespaceCount.Pods) {
- t.Errorf("[Fail]: /allocation (%d) != Prometheus (%d)", len(apiNamespaceCount.Pods), len(promNamespaceCount.Pods))
- t.Errorf("API Pods:\n - %v\nPrometheus Pods:\n - %v", strings.Join(apiNamespaceCount.Pods, "\n - "), strings.Join(promNamespaceCount.Pods, "\n - "))
- } else {
- t.Logf("[Pass]: Pod Count %d", len(apiNamespaceCount.Pods))
- }
- } else {
- t.Errorf("Namespace Missing: Prometheus(%v), allocation API(%v)", apiNamespacePresent, promNamespacePresent)
- }
- }
- })
- }
- }
|