Selaa lähdekoodia

Merge branch 'develop' into alex-core-135-unified-asset

Alex Meijer 3 vuotta sitten
vanhempi
sitoutus
fa204086d3
5 muutettua tiedostoa jossa 131 lisäystä ja 60 poistoa
  1. 35 0
      SECURITY.md
  2. 14 5
      pkg/cloud/provider.go
  3. 51 46
      pkg/costmodel/allocation.go
  4. 25 9
      pkg/costmodel/allocation_helpers.go
  5. 6 0
      pkg/costmodel/key.go

+ 35 - 0
SECURITY.md

@@ -0,0 +1,35 @@
+# OpenCost Security Policy
+
+The OpenCost project greatly appreciates the need for security and timely updates, given our proximity to cloud billing. We are very grateful to the users, security researchers, and developers for reporting security vulnerabilities to us. All reported security vulnerabilities will be carefully assessed, addressed, and responded to.
+
+## Code Security
+
+Application code is version controlled using GitHub. All code changes are tracked with full revision history and are attributable to a specific individual. Code must be reviewed and accepted by a different engineer than the author of the change.
+
+### Dependabot
+
+OpenCost has [Dependabot](https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-supply-chain-security#what-is-dependabot) enabled for assessing dependencies in the project.
+
+## Supported Versions
+
+OpenCost provides security updates for the two most recent minor versions released on GitHub.
+
+For example, if `v1.102.0` is the most recent stable version, we will address security updates for `v1.101.0` and later. Once `v1.103.0` is released, we will no longer provide updates for `v1.101.x` releases.
+
+## Reporting a Vulnerability
+
+The OpenCost project has enabled [Private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing/privately-reporting-a-security-vulnerability) for our repositories which allows for direct reporting of issues to administrators and maintainers in a secure fashion. Please include a thorough description of the issue, the steps you took to create the issue, affected versions, and, if known, mitigations for the issue. The team will help diagnose the severity of the issue and determine how to address the issue. Issues deemed to be non-critical will be filed as GitHub issues. Critical issues will receive immediate attention and be fixed as quickly as possible.
+
+### Kubecost Bug Bounty
+
+Kubecost offers a Bug Bounty program that pays $250 USD for unique, not previously disclosed publicly available CVEs, and accepted security bug reports submitted to vulnerability-report@kubecost.com.
+
+## Disclosure policy
+
+For known public security vulnerabilities, we will disclose the disclosure as soon as possible after receiving the report. Vulnerabilities discovered for the first time will be disclosed in accordance with the following process:
+
+1. The received security vulnerability report shall be handed over to the security team for follow-up coordination and repair work.
+2. After the vulnerability is confirmed, we will create a draft Security Advisory on GitHub that lists the details of the vulnerability.
+3. Invite related personnel to discuss the fix.
+4. Fork the temporary private repository on GitHub, and collaborate to fix the vulnerability.
+5. After the fixed code is merged into all supported versions, the vulnerability will be publicly posted in the GitHub Advisory Database.

+ 14 - 5
pkg/cloud/provider.go

@@ -4,9 +4,8 @@ import (
 	"database/sql"
 	"database/sql"
 	"errors"
 	"errors"
 	"fmt"
 	"fmt"
-	"golang.org/x/text/cases"
-	"golang.org/x/text/language"
 	"io"
 	"io"
+	"net"
 	"net/http"
 	"net/http"
 	"regexp"
 	"regexp"
 	"strconv"
 	"strconv"
@@ -14,6 +13,9 @@ import (
 	"sync"
 	"sync"
 	"time"
 	"time"
 
 
+	"golang.org/x/text/cases"
+	"golang.org/x/text/language"
+
 	"github.com/opencost/opencost/pkg/kubecost"
 	"github.com/opencost/opencost/pkg/kubecost"
 
 
 	"github.com/opencost/opencost/pkg/util"
 	"github.com/opencost/opencost/pkg/util"
@@ -506,9 +508,16 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string, config *config.
 			clusterRegion:    cp.region,
 			clusterRegion:    cp.region,
 			clusterAccountID: cp.accountID,
 			clusterAccountID: cp.accountID,
 			clusterProjectID: cp.projectID,
 			clusterProjectID: cp.projectID,
-			metadataClient: metadata.NewClient(&http.Client{
-				Transport: httputil.NewUserAgentTransport("kubecost", http.DefaultTransport),
-			}),
+			metadataClient: metadata.NewClient(
+				&http.Client{
+					Transport: httputil.NewUserAgentTransport("kubecost", &http.Transport{
+						Dial: (&net.Dialer{
+							Timeout:   2 * time.Second,
+							KeepAlive: 30 * time.Second,
+						}).Dial,
+					}),
+					Timeout: 5 * time.Second,
+				}),
 		}, nil
 		}, nil
 	case kubecost.AWSProvider:
 	case kubecost.AWSProvider:
 		log.Info("Found ProviderID starting with \"aws\", using AWS Provider")
 		log.Info("Found ProviderID starting with \"aws\", using AWS Provider")

+ 51 - 46
pkg/costmodel/allocation.go

@@ -13,51 +13,52 @@ import (
 )
 )
 
 
 const (
 const (
-	queryFmtPods                     = `avg(kube_pod_container_status_running{}) by (pod, namespace, %s)[%s:%s]`
-	queryFmtPodsUID                  = `avg(kube_pod_container_status_running{}) by (pod, namespace, uid, %s)[%s:%s]`
-	queryFmtRAMBytesAllocated        = `avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s, provider_id)`
-	queryFmtRAMRequests              = `avg(avg_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
-	queryFmtRAMUsageAvg              = `avg(avg_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
-	queryFmtRAMUsageMax              = `max(max_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
-	queryFmtCPUCoresAllocated        = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
-	queryFmtCPURequests              = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
-	queryFmtCPUUsageAvg              = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
-	queryFmtGPUsRequested            = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
-	queryFmtGPUsAllocated            = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
-	queryFmtNodeCostPerCPUHr         = `avg(avg_over_time(node_cpu_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
-	queryFmtNodeCostPerRAMGiBHr      = `avg(avg_over_time(node_ram_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
-	queryFmtNodeCostPerGPUHr         = `avg(avg_over_time(node_gpu_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
-	queryFmtNodeIsSpot               = `avg_over_time(kubecost_node_is_spot[%s])`
-	queryFmtPVCInfo                  = `avg(kube_persistentvolumeclaim_info{volumename != ""}) by (persistentvolumeclaim, storageclass, volumename, namespace, %s)[%s:%s]`
-	queryFmtPodPVCAllocation         = `avg(avg_over_time(pod_pvc_allocation[%s])) by (persistentvolume, persistentvolumeclaim, pod, namespace, %s)`
-	queryFmtPVCBytesRequested        = `avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{}[%s])) by (persistentvolumeclaim, namespace, %s)`
-	queryFmtPVActiveMins             = `count(kube_persistentvolume_capacity_bytes) by (persistentvolume, %s)[%s:%s]`
-	queryFmtPVBytes                  = `avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s])) by (persistentvolume, %s)`
-	queryFmtPVCostPerGiBHour         = `avg(avg_over_time(pv_hourly_cost[%s])) by (volumename, %s)`
-	queryFmtNetZoneGiB               = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="true"}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
-	queryFmtNetZoneCostPerGiB        = `avg(avg_over_time(kubecost_network_zone_egress_cost{}[%s])) by (%s)`
-	queryFmtNetRegionGiB             = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false"}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
-	queryFmtNetRegionCostPerGiB      = `avg(avg_over_time(kubecost_network_region_egress_cost{}[%s])) by (%s)`
-	queryFmtNetInternetGiB           = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true"}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
-	queryFmtNetInternetCostPerGiB    = `avg(avg_over_time(kubecost_network_internet_egress_cost{}[%s])) by (%s)`
-	queryFmtNetReceiveBytes          = `sum(increase(container_network_receive_bytes_total{pod!=""}[%s])) by (pod_name, pod, namespace, %s)`
-	queryFmtNetTransferBytes         = `sum(increase(container_network_transmit_bytes_total{pod!=""}[%s])) by (pod_name, pod, namespace, %s)`
-	queryFmtNodeLabels               = `avg_over_time(kube_node_labels[%s])`
-	queryFmtNamespaceLabels          = `avg_over_time(kube_namespace_labels[%s])`
-	queryFmtNamespaceAnnotations     = `avg_over_time(kube_namespace_annotations[%s])`
-	queryFmtPodLabels                = `avg_over_time(kube_pod_labels[%s])`
-	queryFmtPodAnnotations           = `avg_over_time(kube_pod_annotations[%s])`
-	queryFmtServiceLabels            = `avg_over_time(service_selector_labels[%s])`
-	queryFmtDeploymentLabels         = `avg_over_time(deployment_match_labels[%s])`
-	queryFmtStatefulSetLabels        = `avg_over_time(statefulSet_match_labels[%s])`
-	queryFmtDaemonSetLabels          = `sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet"}[%s])) by (pod, owner_name, namespace, %s)`
-	queryFmtJobLabels                = `sum(avg_over_time(kube_pod_owner{owner_kind="Job"}[%s])) by (pod, owner_name, namespace ,%s)`
-	queryFmtPodsWithReplicaSetOwner  = `sum(avg_over_time(kube_pod_owner{owner_kind="ReplicaSet"}[%s])) by (pod, owner_name, namespace ,%s)`
-	queryFmtReplicaSetsWithoutOwners = `avg(avg_over_time(kube_replicaset_owner{owner_kind="<none>", owner_name="<none>"}[%s])) by (replicaset, namespace, %s)`
-	queryFmtLBCostPerHr              = `avg(avg_over_time(kubecost_load_balancer_cost[%s])) by (namespace, service_name, %s)`
-	queryFmtLBActiveMins             = `count(kubecost_load_balancer_cost) by (namespace, service_name, %s)[%s:%s]`
-	queryFmtOldestSample             = `min_over_time(timestamp(group(node_cpu_hourly_cost))[%s:%s])`
-	queryFmtNewestSample             = `max_over_time(timestamp(group(node_cpu_hourly_cost))[%s:%s])`
+	queryFmtPods                        = `avg(kube_pod_container_status_running{}) by (pod, namespace, %s)[%s:%s]`
+	queryFmtPodsUID                     = `avg(kube_pod_container_status_running{}) by (pod, namespace, uid, %s)[%s:%s]`
+	queryFmtRAMBytesAllocated           = `avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s, provider_id)`
+	queryFmtRAMRequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtRAMUsageAvg                 = `avg(avg_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
+	queryFmtRAMUsageMax                 = `max(max_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
+	queryFmtCPUCoresAllocated           = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtCPURequests                 = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtCPUUsageAvg                 = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD"}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
+	queryFmtGPUsRequested               = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtGPUsAllocated               = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!=""}[%s])) by (container, pod, namespace, node, %s)`
+	queryFmtNodeCostPerCPUHr            = `avg(avg_over_time(node_cpu_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
+	queryFmtNodeCostPerRAMGiBHr         = `avg(avg_over_time(node_ram_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
+	queryFmtNodeCostPerGPUHr            = `avg(avg_over_time(node_gpu_hourly_cost[%s])) by (node, %s, instance_type, provider_id)`
+	queryFmtNodeIsSpot                  = `avg_over_time(kubecost_node_is_spot[%s])`
+	queryFmtPVCInfo                     = `avg(kube_persistentvolumeclaim_info{volumename != ""}) by (persistentvolumeclaim, storageclass, volumename, namespace, %s)[%s:%s]`
+	queryFmtPodPVCAllocation            = `avg(avg_over_time(pod_pvc_allocation[%s])) by (persistentvolume, persistentvolumeclaim, pod, namespace, %s)`
+	queryFmtPVCBytesRequested           = `avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{}[%s])) by (persistentvolumeclaim, namespace, %s)`
+	queryFmtPVActiveMins                = `count(kube_persistentvolume_capacity_bytes) by (persistentvolume, %s)[%s:%s]`
+	queryFmtPVBytes                     = `avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s])) by (persistentvolume, %s)`
+	queryFmtPVCostPerGiBHour            = `avg(avg_over_time(pv_hourly_cost[%s])) by (volumename, %s)`
+	queryFmtNetZoneGiB                  = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="true"}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
+	queryFmtNetZoneCostPerGiB           = `avg(avg_over_time(kubecost_network_zone_egress_cost{}[%s])) by (%s)`
+	queryFmtNetRegionGiB                = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false"}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
+	queryFmtNetRegionCostPerGiB         = `avg(avg_over_time(kubecost_network_region_egress_cost{}[%s])) by (%s)`
+	queryFmtNetInternetGiB              = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true"}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
+	queryFmtNetInternetCostPerGiB       = `avg(avg_over_time(kubecost_network_internet_egress_cost{}[%s])) by (%s)`
+	queryFmtNetReceiveBytes             = `sum(increase(container_network_receive_bytes_total{pod!="", container="POD"}[%s])) by (pod_name, pod, namespace, %s)`
+	queryFmtNetTransferBytes            = `sum(increase(container_network_transmit_bytes_total{pod!="", container="POD"}[%s])) by (pod_name, pod, namespace, %s)`
+	queryFmtNodeLabels                  = `avg_over_time(kube_node_labels[%s])`
+	queryFmtNamespaceLabels             = `avg_over_time(kube_namespace_labels[%s])`
+	queryFmtNamespaceAnnotations        = `avg_over_time(kube_namespace_annotations[%s])`
+	queryFmtPodLabels                   = `avg_over_time(kube_pod_labels[%s])`
+	queryFmtPodAnnotations              = `avg_over_time(kube_pod_annotations[%s])`
+	queryFmtServiceLabels               = `avg_over_time(service_selector_labels[%s])`
+	queryFmtDeploymentLabels            = `avg_over_time(deployment_match_labels[%s])`
+	queryFmtStatefulSetLabels           = `avg_over_time(statefulSet_match_labels[%s])`
+	queryFmtDaemonSetLabels             = `sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet"}[%s])) by (pod, owner_name, namespace, %s)`
+	queryFmtJobLabels                   = `sum(avg_over_time(kube_pod_owner{owner_kind="Job"}[%s])) by (pod, owner_name, namespace ,%s)`
+	queryFmtPodsWithReplicaSetOwner     = `sum(avg_over_time(kube_pod_owner{owner_kind="ReplicaSet"}[%s])) by (pod, owner_name, namespace ,%s)`
+	queryFmtReplicaSetsWithoutOwners    = `avg(avg_over_time(kube_replicaset_owner{owner_kind="<none>", owner_name="<none>"}[%s])) by (replicaset, namespace, %s)`
+	queryFmtReplicaSetsWithRolloutOwner = `avg(avg_over_time(kube_replicaset_owner{owner_kind="Rollout"}[%s])) by (replicaset, namespace, owner_kind, owner_name, %s)`
+	queryFmtLBCostPerHr                 = `avg(avg_over_time(kubecost_load_balancer_cost[%s])) by (namespace, service_name, %s)`
+	queryFmtLBActiveMins                = `count(kubecost_load_balancer_cost) by (namespace, service_name, %s)[%s:%s]`
+	queryFmtOldestSample                = `min_over_time(timestamp(group(node_cpu_hourly_cost))[%s:%s])`
+	queryFmtNewestSample                = `max_over_time(timestamp(group(node_cpu_hourly_cost))[%s:%s])`
 
 
 
 
 	// Because we use container_cpu_usage_seconds_total to calculate CPU usage
 	// Because we use container_cpu_usage_seconds_total to calculate CPU usage
@@ -504,6 +505,9 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	queryReplicaSetsWithoutOwners := fmt.Sprintf(queryFmtReplicaSetsWithoutOwners, durStr, env.GetPromClusterLabel())
 	queryReplicaSetsWithoutOwners := fmt.Sprintf(queryFmtReplicaSetsWithoutOwners, durStr, env.GetPromClusterLabel())
 	resChReplicaSetsWithoutOwners := ctx.QueryAtTime(queryReplicaSetsWithoutOwners, end)
 	resChReplicaSetsWithoutOwners := ctx.QueryAtTime(queryReplicaSetsWithoutOwners, end)
 
 
+	queryReplicaSetsWithRolloutOwner := fmt.Sprintf(queryFmtReplicaSetsWithRolloutOwner, durStr, env.GetPromClusterLabel())
+	resChReplicaSetsWithRolloutOwner := ctx.QueryAtTime(queryReplicaSetsWithRolloutOwner, end)
+
 	queryJobLabels := fmt.Sprintf(queryFmtJobLabels, durStr, env.GetPromClusterLabel())
 	queryJobLabels := fmt.Sprintf(queryFmtJobLabels, durStr, env.GetPromClusterLabel())
 	resChJobLabels := ctx.QueryAtTime(queryJobLabels, end)
 	resChJobLabels := ctx.QueryAtTime(queryJobLabels, end)
 
 
@@ -561,6 +565,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	resDaemonSetLabels, _ := resChDaemonSetLabels.Await()
 	resDaemonSetLabels, _ := resChDaemonSetLabels.Await()
 	resPodsWithReplicaSetOwner, _ := resChPodsWithReplicaSetOwner.Await()
 	resPodsWithReplicaSetOwner, _ := resChPodsWithReplicaSetOwner.Await()
 	resReplicaSetsWithoutOwners, _ := resChReplicaSetsWithoutOwners.Await()
 	resReplicaSetsWithoutOwners, _ := resChReplicaSetsWithoutOwners.Await()
+	resReplicaSetsWithRolloutOwner, _ := resChReplicaSetsWithRolloutOwner.Await()
 	resJobLabels, _ := resChJobLabels.Await()
 	resJobLabels, _ := resChJobLabels.Await()
 	resLBCostPerHr, _ := resChLBCostPerHr.Await()
 	resLBCostPerHr, _ := resChLBCostPerHr.Await()
 	resLBActiveMins, _ := resChLBActiveMins.Await()
 	resLBActiveMins, _ := resChLBActiveMins.Await()
@@ -616,7 +621,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
 	podStatefulSetMap := labelsToPodControllerMap(podLabels, resToStatefulSetLabels(resStatefulSetLabels))
 	podStatefulSetMap := labelsToPodControllerMap(podLabels, resToStatefulSetLabels(resStatefulSetLabels))
 	podDaemonSetMap := resToPodDaemonSetMap(resDaemonSetLabels, podUIDKeyMap, ingestPodUID)
 	podDaemonSetMap := resToPodDaemonSetMap(resDaemonSetLabels, podUIDKeyMap, ingestPodUID)
 	podJobMap := resToPodJobMap(resJobLabels, podUIDKeyMap, ingestPodUID)
 	podJobMap := resToPodJobMap(resJobLabels, podUIDKeyMap, ingestPodUID)
-	podReplicaSetMap := resToPodReplicaSetMap(resPodsWithReplicaSetOwner, resReplicaSetsWithoutOwners, podUIDKeyMap, ingestPodUID)
+	podReplicaSetMap := resToPodReplicaSetMap(resPodsWithReplicaSetOwner, resReplicaSetsWithoutOwners, resReplicaSetsWithRolloutOwner, podUIDKeyMap, ingestPodUID)
 	applyControllersToPods(podMap, podDeploymentMap)
 	applyControllersToPods(podMap, podDeploymentMap)
 	applyControllersToPods(podMap, podStatefulSetMap)
 	applyControllersToPods(podMap, podStatefulSetMap)
 	applyControllersToPods(podMap, podDaemonSetMap)
 	applyControllersToPods(podMap, podDaemonSetMap)

+ 25 - 9
pkg/costmodel/allocation_helpers.go

@@ -1156,13 +1156,16 @@ func resToPodJobMap(resJobLabels []*prom.QueryResult, podUIDKeyMap map[podKey][]
 	return jobLabels
 	return jobLabels
 }
 }
 
 
-func resToPodReplicaSetMap(resPodsWithReplicaSetOwner []*prom.QueryResult, resReplicaSetsWithoutOwners []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey, ingestPodUID bool) map[podKey]controllerKey {
+func resToPodReplicaSetMap(resPodsWithReplicaSetOwner []*prom.QueryResult, resReplicaSetsWithoutOwners []*prom.QueryResult, resReplicaSetsWithRolloutOwner []*prom.QueryResult, podUIDKeyMap map[podKey][]podKey, ingestPodUID bool) map[podKey]controllerKey {
 	// Build out set of ReplicaSets that have no owners, themselves, such that
 	// Build out set of ReplicaSets that have no owners, themselves, such that
 	// the ReplicaSet should be used as the owner of the Pods it controls.
 	// the ReplicaSet should be used as the owner of the Pods it controls.
 	// (This should exclude, for example, ReplicaSets that are controlled by
 	// (This should exclude, for example, ReplicaSets that are controlled by
 	// Deployments, in which case the Deployment should be the pod's owner.)
 	// Deployments, in which case the Deployment should be the pod's owner.)
+	// Additionally, add to this set of ReplicaSets those ReplicaSets that
+	// are owned by a Rollout
 	replicaSets := map[controllerKey]struct{}{}
 	replicaSets := map[controllerKey]struct{}{}
 
 
+	// Create unowned ReplicaSet controller keys
 	for _, res := range resReplicaSetsWithoutOwners {
 	for _, res := range resReplicaSetsWithoutOwners {
 		controllerKey, err := resultReplicaSetKey(res, env.GetPromClusterLabel(), "namespace", "replicaset")
 		controllerKey, err := resultReplicaSetKey(res, env.GetPromClusterLabel(), "namespace", "replicaset")
 		if err != nil {
 		if err != nil {
@@ -1172,17 +1175,34 @@ func resToPodReplicaSetMap(resPodsWithReplicaSetOwner []*prom.QueryResult, resRe
 		replicaSets[controllerKey] = struct{}{}
 		replicaSets[controllerKey] = struct{}{}
 	}
 	}
 
 
+	// Create Rollout-owned ReplicaSet controller keys
+	for _, res := range resReplicaSetsWithRolloutOwner {
+		controllerKey, err := resultReplicaSetRolloutKey(res, env.GetPromClusterLabel(), "namespace", "replicaset")
+		if err != nil {
+			continue
+		}
+
+		replicaSets[controllerKey] = struct{}{}
+	}
+
 	// Create the mapping of Pods to ReplicaSets, ignoring any ReplicaSets that
 	// Create the mapping of Pods to ReplicaSets, ignoring any ReplicaSets that
-	// to not appear in the set of uncontrolled ReplicaSets above.
+	// do not appear in the set of unowned/Rollout-owned ReplicaSets above.
 	podToReplicaSet := map[podKey]controllerKey{}
 	podToReplicaSet := map[podKey]controllerKey{}
 
 
 	for _, res := range resPodsWithReplicaSetOwner {
 	for _, res := range resPodsWithReplicaSetOwner {
+		// First, check if this pod is owned by an unowned ReplicaSet
 		controllerKey, err := resultReplicaSetKey(res, env.GetPromClusterLabel(), "namespace", "owner_name")
 		controllerKey, err := resultReplicaSetKey(res, env.GetPromClusterLabel(), "namespace", "owner_name")
 		if err != nil {
 		if err != nil {
 			continue
 			continue
-		}
-		if _, ok := replicaSets[controllerKey]; !ok {
-			continue
+		} else if _, ok := replicaSets[controllerKey]; !ok {
+			// If the pod is not owned by an unowned ReplicaSet, check if
+			// it's owned by a Rollout-owned ReplicaSet
+			controllerKey, err = resultReplicaSetRolloutKey(res, env.GetPromClusterLabel(), "namespace", "owner_name")
+			if err != nil {
+				continue
+			} else if _, ok := replicaSets[controllerKey]; !ok {
+				continue
+			}
 		}
 		}
 
 
 		pod, err := res.GetString("pod")
 		pod, err := res.GetString("pod")
@@ -1196,18 +1216,14 @@ func resToPodReplicaSetMap(resPodsWithReplicaSetOwner []*prom.QueryResult, resRe
 
 
 		if ingestPodUID {
 		if ingestPodUID {
 			if uidKeys, ok := podUIDKeyMap[key]; ok {
 			if uidKeys, ok := podUIDKeyMap[key]; ok {
-
 				keys = append(keys, uidKeys...)
 				keys = append(keys, uidKeys...)
-
 			}
 			}
 		} else {
 		} else {
 			keys = []podKey{key}
 			keys = []podKey{key}
 		}
 		}
 
 
 		for _, key := range keys {
 		for _, key := range keys {
-
 			podToReplicaSet[key] = controllerKey
 			podToReplicaSet[key] = controllerKey
-
 		}
 		}
 	}
 	}
 
 

+ 6 - 0
pkg/costmodel/key.go

@@ -243,6 +243,12 @@ func resultReplicaSetKey(res *prom.QueryResult, clusterLabel, namespaceLabel, co
 	return resultControllerKey("replicaset", res, clusterLabel, namespaceLabel, controllerLabel)
 	return resultControllerKey("replicaset", res, clusterLabel, namespaceLabel, controllerLabel)
 }
 }
 
 
+// resultReplicaSetRolloutKey creates a controllerKey for a Job.
+// (See resultControllerKey for more.)
+func resultReplicaSetRolloutKey(res *prom.QueryResult, clusterLabel, namespaceLabel, controllerLabel string) (controllerKey, error) {
+	return resultControllerKey("rollout", res, clusterLabel, namespaceLabel, controllerLabel)
+}
+
 type serviceKey struct {
 type serviceKey struct {
 	Cluster   string
 	Cluster   string
 	Namespace string
 	Namespace string