Просмотр исходного кода

CostModel.ComputeAllocation: query clean-up; fix GPUs; address TODOs

Niko Kovacevic 5 лет назад
Родитель
Commit
9ba5a4bae9

+ 105 - 60
pkg/costmodel/allocation.go

@@ -12,13 +12,13 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 )
 
-// TODO niko/cdmr NodeProp issue
+// TODO niko/computeallocation NodeProp issue
 // http://kubecost.nikovacevic.io/model/allocation?window=yesterday => Error: NodeProp not set
 
-// TODO niko/cdmr split into required and optional queries?
+// TODO niko/computeallocation split into required and optional queries?
 
-// TODO niko/cdmr move to pkg/kubecost
-// TODO niko/cdmr add PersistenVolumeClaims to type Allocation?
+// TODO niko/computeallocation move to pkg/kubecost
+// TODO niko/computeallocation add PersistentVolumeClaims to type Allocation?
 type PVC struct {
 	Bytes     float64   `json:"bytes"`
 	Count     int       `json:"count"`
@@ -56,10 +56,10 @@ func (pvc *PVC) String() string {
 	return fmt.Sprintf("%s/%s/%s{Bytes:%.2f, Cost:%.6f, Start,End:%s}", pvc.Cluster, pvc.Namespace, pvc.Name, pvc.Bytes, pvc.Cost(), kubecost.NewWindow(&pvc.Start, &pvc.End))
 }
 
-// TODO niko/cdmr move to pkg/kubecost
+// TODO niko/computeallocation move to pkg/kubecost
 type PV struct {
 	Bytes          float64 `json:"bytes"`
-	CostPerGiBHour float64 `json:"costPerGiBHour"` // TODO niko/cdmr GiB or GB?
+	CostPerGiBHour float64 `json:"costPerGiBHour"` // TODO niko/computeallocation GiB or GB?
 	Cluster        string  `json:"cluster"`
 	Name           string  `json:"name"`
 	StorageClass   string  `json:"storageClass"`
@@ -72,6 +72,41 @@ func (pv *PV) String() string {
 	return fmt.Sprintf("%s/%s{Bytes:%.2f, Cost/GiB*Hr:%.6f, StorageClass:%s}", pv.Cluster, pv.Name, pv.Bytes, pv.CostPerGiBHour, pv.StorageClass)
 }
 
+const (
+	queryFmtMinutes               = `avg(kube_pod_container_status_running{}) by (container, pod, namespace, cluster_id)[%s:%s]%s`
+	queryFmtRAMBytesAllocated     = `avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`
+	queryFmtRAMRequests           = `avg(avg_over_time(kube_pod_container_resource_requests_memory_bytes{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`
+	queryFmtRAMUsage              = `avg(avg_over_time(container_memory_working_set_bytes{container_name!="", container_name!="POD", instance!=""}[%s]%s)) by (container_name, pod_name, namespace, instance, cluster_id)`
+	queryFmtCPUCoresAllocated     = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`
+	queryFmtCPURequests           = `avg(avg_over_time(kube_pod_container_resource_requests_cpu_cores{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`
+	queryFmtCPUUsage              = `avg(rate(container_cpu_usage_seconds_total{container_name!="", container_name!="POD", instance!=""}[%s]%s)) by (container_name, pod_name, namespace, instance, cluster_id)`
+	queryFmtGPUsRequested         = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`
+	queryFmtNodeCostPerCPUHr      = `avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (node, cluster_id, instance_type)`
+	queryFmtNodeCostPerRAMGiBHr   = `avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (node, cluster_id, instance_type)`
+	queryFmtNodeCostPerGPUHr      = `avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (node, cluster_id, instance_type)`
+	queryFmtNodeIsSpot            = `avg_over_time(kubecost_node_is_spot[%s]%s)`
+	queryFmtPVCInfo               = `avg(kube_persistentvolumeclaim_info{volumename != ""}) by (persistentvolumeclaim, storageclass, volumename, namespace, cluster_id)[%s:%s]%s`
+	queryFmtPVBytes               = `avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s]%s)) by (persistentvolume, cluster_id)`
+	queryFmtPodPVCAllocation      = `avg(avg_over_time(pod_pvc_allocation[%s]%s)) by (persistentvolume, persistentvolumeclaim, pod, namespace, cluster_id)`
+	queryFmtPVCBytesRequested     = `avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{}[%s]%s)) by (persistentvolumeclaim, namespace, cluster_id)`
+	queryFmtPVCostPerGiBHour      = `avg(avg_over_time(pv_hourly_cost[%s]%s)) by (volumename, cluster_id)`
+	queryFmtNetZoneGiB            = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="true"}[%s]%s)) by (pod_name, namespace, cluster_id) / 1024 / 1024 / 1024`
+	queryFmtNetZoneCostPerGiB     = `avg(avg_over_time(kubecost_network_zone_egress_cost{}[%s]%s)) by (cluster_id)`
+	queryFmtNetRegionGiB          = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false"}[%s]%s)) by (pod_name, namespace, cluster_id) / 1024 / 1024 / 1024`
+	queryFmtNetRegionCostPerGiB   = `avg(avg_over_time(kubecost_network_region_egress_cost{}[%s]%s)) by (cluster_id)`
+	queryFmtNetInternetGiB        = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true"}[%s]%s)) by (pod_name, namespace, cluster_id) / 1024 / 1024 / 1024`
+	queryFmtNetInternetCostPerGiB = `avg(avg_over_time(kubecost_network_internet_egress_cost{}[%s]%s)) by (cluster_id)`
+	queryFmtNamespaceLabels       = `avg_over_time(kube_namespace_labels[%s]%s)`
+	queryFmtNamespaceAnnotations  = `avg_over_time(kube_namespace_annotations[%s]%s)`
+	queryFmtPodLabels             = `avg_over_time(kube_pod_labels[%s]%s)`
+	queryFmtPodAnnotations        = `avg_over_time(kube_pod_annotations[%s]%s)`
+	queryFmtServiceLabels         = `avg_over_time(service_selector_labels[%s]%s)`
+	queryFmtDeploymentLabels      = `avg_over_time(deployment_match_labels[%s]%s)`
+	queryFmtStatefulSetLabels     = `avg_over_time(statefulSet_match_labels[%s]%s)`
+	queryFmtDaemonSetLabels       = `sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet"}[%s]%s)) by (pod, owner_name, namespace, cluster_id)`
+	queryFmtJobLabels             = `sum(avg_over_time(kube_pod_owner{owner_kind="Job"}[%s]%s)) by (pod, owner_name, namespace ,cluster_id)`
+)
+
 // ComputeAllocation uses the CostModel instance to compute an AllocationSet
 // for the window defined by the given start and end times. The Allocations
 // returned are unaggregated (i.e. down to the container level).
@@ -97,7 +132,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 	// If using Thanos, increase offset to 3 hours, reducing the duration by
 	// equal measure to maintain the same starting point.
 	thanosDur := thanos.OffsetDuration()
-	// TODO niko/cdmr confirm that this flag works interchangeably with ThanosClient != nil
+	// TODO niko/computeallocation confirm that this flag works interchangeably with ThanosClient != nil
 	if offset < thanosDur && env.IsThanosEnabled() {
 		diff := thanosDur - offset
 		offset += diff
@@ -123,117 +158,116 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 		offStr = ""
 	}
 
-	// TODO niko/cdmr dynamic resolution? add to ComputeAllocation() in allocation.Source?
+	// TODO niko/computeallocation dynamic resolution? add to ComputeAllocation() in allocation.Source?
 	resStr := "1m"
 	// resPerHr := 60
 
-	// TODO niko/cdmr remove after testing
+	// TODO niko/computeallocation remove after testing
 	startQuerying := time.Now()
 
 	ctx := prom.NewContext(cm.PrometheusClient)
 
-	// TODO niko/cdmr retries? (That should probably go into the Store.)
+	// TODO niko/computeallocation retries? (That should probably go into the Store.)
 
 	// TODO niko/cmdr check: will multiple Prometheus jobs multiply the totals?
 
-	// TODO niko/cdmr should we try doing this without resolution? Could yield
+	// TODO niko/computeallocation should we try doing this without resolution? Could yield
 	// more accurate results, but might also be more challenging in some
 	// respects; e.g. "correcting" the start point by what amount?
-	queryMinutes := fmt.Sprintf(`avg(kube_pod_container_status_running{}) by (container, pod, namespace, cluster_id)[%s:%s]%s`, durStr, resStr, offStr)
+	queryMinutes := fmt.Sprintf(queryFmtMinutes, durStr, resStr, offStr)
 	resChMinutes := ctx.Query(queryMinutes)
 
-	queryRAMBytesAllocated := fmt.Sprintf(`avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`, durStr, offStr)
+	queryRAMBytesAllocated := fmt.Sprintf(queryFmtRAMBytesAllocated, durStr, offStr)
 	resChRAMBytesAllocated := ctx.Query(queryRAMBytesAllocated)
 
-	queryRAMRequests := fmt.Sprintf(`avg(avg_over_time(kube_pod_container_resource_requests_memory_bytes{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`, durStr, offStr)
+	queryRAMRequests := fmt.Sprintf(queryFmtRAMRequests, durStr, offStr)
 	resChRAMRequests := ctx.Query(queryRAMRequests)
 
-	queryRAMUsage := fmt.Sprintf(`avg(avg_over_time(container_memory_working_set_bytes{container_name!="", container_name!="POD", instance!=""}[%s]%s)) by (container_name, pod_name, namespace, instance, cluster_id)`, durStr, offStr)
+	queryRAMUsage := fmt.Sprintf(queryFmtRAMUsage, durStr, offStr)
 	resChRAMUsage := ctx.Query(queryRAMUsage)
 
-	queryCPUCoresAllocated := fmt.Sprintf(`avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`, durStr, offStr)
+	queryCPUCoresAllocated := fmt.Sprintf(queryFmtCPUCoresAllocated, durStr, offStr)
 	resChCPUCoresAllocated := ctx.Query(queryCPUCoresAllocated)
 
-	queryCPURequests := fmt.Sprintf(`avg(avg_over_time(kube_pod_container_resource_requests_cpu_cores{container!="", container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`, durStr, offStr)
+	queryCPURequests := fmt.Sprintf(queryFmtCPURequests, durStr, offStr)
 	resChCPURequests := ctx.Query(queryCPURequests)
 
-	queryCPUUsage := fmt.Sprintf(`avg(rate(container_cpu_usage_seconds_total{container_name!="", container_name!="POD", instance!=""}[%s]%s)) by (container_name, pod_name, namespace, instance, cluster_id)`, durStr, offStr)
+	queryCPUUsage := fmt.Sprintf(queryFmtCPUUsage, durStr, offStr)
 	resChCPUUsage := ctx.Query(queryCPUUsage)
 
-	// TODO niko/cdmr find an env with GPUs to test this (generate one?)
-	queryGPUsRequested := fmt.Sprintf(`avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!=""}[%s]%s)) by (container, pod, namespace, node, cluster_id)`, durStr, offStr)
+	queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, durStr, offStr)
 	resChGPUsRequested := ctx.Query(queryGPUsRequested)
 
-	queryNodeCostPerCPUHr := fmt.Sprintf(`avg(avg_over_time(node_cpu_hourly_cost[%s]%s)) by (node, cluster_id, instance_type)`, durStr, offStr)
+	queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, durStr, offStr)
 	resChNodeCostPerCPUHr := ctx.Query(queryNodeCostPerCPUHr)
 
-	queryNodeCostPerRAMGiBHr := fmt.Sprintf(`avg(avg_over_time(node_ram_hourly_cost[%s]%s)) by (node, cluster_id, instance_type)`, durStr, offStr)
+	queryNodeCostPerRAMGiBHr := fmt.Sprintf(queryFmtNodeCostPerRAMGiBHr, durStr, offStr)
 	resChNodeCostPerRAMGiBHr := ctx.Query(queryNodeCostPerRAMGiBHr)
 
-	queryNodeCostPerGPUHr := fmt.Sprintf(`avg(avg_over_time(node_gpu_hourly_cost[%s]%s)) by (node, cluster_id, instance_type)`, durStr, offStr)
+	queryNodeCostPerGPUHr := fmt.Sprintf(queryFmtNodeCostPerGPUHr, durStr, offStr)
 	resChNodeCostPerGPUHr := ctx.Query(queryNodeCostPerGPUHr)
 
-	queryNodeIsSpot := fmt.Sprintf(`avg_over_time(kubecost_node_is_spot[%s]%s)`, durStr, offStr)
+	queryNodeIsSpot := fmt.Sprintf(queryFmtNodeIsSpot, durStr, offStr)
 	resChNodeIsSpot := ctx.Query(queryNodeIsSpot)
 
-	queryPVCInfo := fmt.Sprintf(`avg(kube_persistentvolumeclaim_info{volumename != ""}) by (persistentvolumeclaim, storageclass, volumename, namespace, cluster_id)[%s:%s]%s`, durStr, resStr, offStr)
+	queryPVCInfo := fmt.Sprintf(queryFmtPVCInfo, durStr, resStr, offStr)
 	resChPVCInfo := ctx.Query(queryPVCInfo)
 
-	queryPVBytes := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolume_capacity_bytes[%s]%s)) by (persistentvolume, cluster_id)`, durStr, offStr)
+	queryPVBytes := fmt.Sprintf(queryFmtPVBytes, durStr, offStr)
 	resChPVBytes := ctx.Query(queryPVBytes)
 
-	queryPodPVCAllocation := fmt.Sprintf(`avg(avg_over_time(pod_pvc_allocation[%s]%s)) by (persistentvolume, persistentvolumeclaim, pod, namespace, cluster_id)`, durStr, offStr)
+	queryPodPVCAllocation := fmt.Sprintf(queryFmtPodPVCAllocation, durStr, offStr)
 	resChPodPVCAllocation := ctx.Query(queryPodPVCAllocation)
 
-	queryPVCBytesRequested := fmt.Sprintf(`avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{}[%s]%s)) by (persistentvolumeclaim, namespace, cluster_id)`, durStr, offStr)
+	queryPVCBytesRequested := fmt.Sprintf(queryFmtPVCBytesRequested, durStr, offStr)
 	resChPVCBytesRequested := ctx.Query(queryPVCBytesRequested)
 
-	queryPVCostPerGiBHour := fmt.Sprintf(`avg(avg_over_time(pv_hourly_cost[%s]%s)) by (volumename, cluster_id)`, durStr, offStr)
+	queryPVCostPerGiBHour := fmt.Sprintf(queryFmtPVCostPerGiBHour, durStr, offStr)
 	resChPVCostPerGiBHour := ctx.Query(queryPVCostPerGiBHour)
 
-	queryNetZoneGiB := fmt.Sprintf(`sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="true"}[%s]%s)) by (pod_name, namespace, cluster_id) / 1024 / 1024 / 1024`, durStr, offStr)
+	queryNetZoneGiB := fmt.Sprintf(queryFmtNetZoneGiB, durStr, offStr)
 	resChNetZoneGiB := ctx.Query(queryNetZoneGiB)
 
-	queryNetZoneCostPerGiB := fmt.Sprintf(`avg(avg_over_time(kubecost_network_zone_egress_cost{}[%s]%s)) by (cluster_id)`, durStr, offStr)
+	queryNetZoneCostPerGiB := fmt.Sprintf(queryFmtNetZoneCostPerGiB, durStr, offStr)
 	resChNetZoneCostPerGiB := ctx.Query(queryNetZoneCostPerGiB)
 
-	queryNetRegionGiB := fmt.Sprintf(`sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false"}[%s]%s)) by (pod_name, namespace, cluster_id) / 1024 / 1024 / 1024`, durStr, offStr)
+	queryNetRegionGiB := fmt.Sprintf(queryFmtNetRegionGiB, durStr, offStr)
 	resChNetRegionGiB := ctx.Query(queryNetRegionGiB)
 
-	queryNetRegionCostPerGiB := fmt.Sprintf(`avg(avg_over_time(kubecost_network_region_egress_cost{}[%s]%s)) by (cluster_id)`, durStr, offStr)
+	queryNetRegionCostPerGiB := fmt.Sprintf(queryFmtNetRegionCostPerGiB, durStr, offStr)
 	resChNetRegionCostPerGiB := ctx.Query(queryNetRegionCostPerGiB)
 
-	queryNetInternetGiB := fmt.Sprintf(`sum(increase(kubecost_pod_network_egress_bytes_total{internet="true"}[%s]%s)) by (pod_name, namespace, cluster_id) / 1024 / 1024 / 1024`, durStr, offStr)
+	queryNetInternetGiB := fmt.Sprintf(queryFmtNetInternetGiB, durStr, offStr)
 	resChNetInternetGiB := ctx.Query(queryNetInternetGiB)
 
-	queryNetInternetCostPerGiB := fmt.Sprintf(`avg(avg_over_time(kubecost_network_internet_egress_cost{}[%s]%s)) by (cluster_id)`, durStr, offStr)
+	queryNetInternetCostPerGiB := fmt.Sprintf(queryFmtNetInternetCostPerGiB, durStr, offStr)
 	resChNetInternetCostPerGiB := ctx.Query(queryNetInternetCostPerGiB)
 
-	queryNamespaceLabels := fmt.Sprintf(`avg_over_time(kube_namespace_labels[%s]%s)`, durStr, offStr)
+	queryNamespaceLabels := fmt.Sprintf(queryFmtNamespaceLabels, durStr, offStr)
 	resChNamespaceLabels := ctx.Query(queryNamespaceLabels)
 
-	queryNamespaceAnnotations := fmt.Sprintf(`avg_over_time(kube_namespace_annotations[%s]%s)`, durStr, offStr)
+	queryNamespaceAnnotations := fmt.Sprintf(queryFmtNamespaceAnnotations, durStr, offStr)
 	resChNamespaceAnnotations := ctx.Query(queryNamespaceAnnotations)
 
-	queryPodLabels := fmt.Sprintf(`avg_over_time(kube_pod_labels[%s]%s)`, durStr, offStr)
+	queryPodLabels := fmt.Sprintf(queryFmtPodLabels, durStr, offStr)
 	resChPodLabels := ctx.Query(queryPodLabels)
 
-	queryPodAnnotations := fmt.Sprintf(`avg_over_time(kube_pod_annotations[%s]%s)`, durStr, offStr)
+	queryPodAnnotations := fmt.Sprintf(queryFmtPodAnnotations, durStr, offStr)
 	resChPodAnnotations := ctx.Query(queryPodAnnotations)
 
-	queryServiceLabels := fmt.Sprintf(`avg_over_time(service_selector_labels[%s]%s)`, durStr, offStr)
+	queryServiceLabels := fmt.Sprintf(queryFmtServiceLabels, durStr, offStr)
 	resChServiceLabels := ctx.Query(queryServiceLabels)
 
-	queryDeploymentLabels := fmt.Sprintf(`avg_over_time(deployment_match_labels[%s]%s)`, durStr, offStr)
+	queryDeploymentLabels := fmt.Sprintf(queryFmtDeploymentLabels, durStr, offStr)
 	resChDeploymentLabels := ctx.Query(queryDeploymentLabels)
 
-	queryStatefulSetLabels := fmt.Sprintf(`avg_over_time(statefulSet_match_labels[%s]%s)`, durStr, offStr)
+	queryStatefulSetLabels := fmt.Sprintf(queryFmtStatefulSetLabels, durStr, offStr)
 	resChStatefulSetLabels := ctx.Query(queryStatefulSetLabels)
 
-	queryDaemonSetLabels := fmt.Sprintf(`sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet"}[%s]%s)) by (pod, owner_name, namespace, cluster_id)`, durStr, offStr)
+	queryDaemonSetLabels := fmt.Sprintf(queryFmtDaemonSetLabels, durStr, offStr)
 	resChDaemonSetLabels := ctx.Query(queryDaemonSetLabels)
 
-	queryJobLabels := fmt.Sprintf(`sum(avg_over_time(kube_pod_owner{owner_kind="Job"}[%s]%s)) by (pod, owner_name, namespace ,cluster_id)`, durStr, offStr)
+	queryJobLabels := fmt.Sprintf(queryFmtJobLabels, durStr, offStr)
 	resChJobLabels := ctx.Query(queryJobLabels)
 
 	resMinutes, _ := resChMinutes.Await()
@@ -275,7 +309,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 	resJobLabels, _ := resChJobLabels.Await()
 
 	// ----------------------------------------------------------------------//
-	// TODO niko/cdmr remove all logs after testing
+	// TODO niko/computeallocation remove all logs after testing
 
 	// log.Infof("CostModel.ComputeAllocation: minutes  : %s", queryMinutes)
 
@@ -321,7 +355,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 	// Build out a map of Allocations, starting with (start, end) so that we
 	// begin with minutes, from which we compute resource allocation and cost
 	// totals from measured rate data.
-	// TODO niko/cdmr can we start with a reasonable guess at map size?
+	// TODO niko/computeallocation can we start with a reasonable guess at map size?
 	allocationMap := map[containerKey]*kubecost.Allocation{}
 
 	// Keep track of the allocations per pod, for the sake of splitting PVC and
@@ -347,7 +381,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 	applyNetworkAllocation(allocationMap, podAllocation, resNetRegionGiB, resNetRegionCostPerGiB)
 	applyNetworkAllocation(allocationMap, podAllocation, resNetInternetGiB, resNetInternetCostPerGiB)
 
-	// TODO niko/cdmr pruneDuplicateData? (see costmodel.go)
+	// TODO niko/computeallocation pruneDuplicateData? (see costmodel.go)
 
 	namespaceLabels := resToNamespaceLabels(resNamespaceLabels)
 	podLabels := resToPodLabels(resPodLabels)
@@ -368,7 +402,7 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 	applyControllersToPods(allocationMap, podDaemonSetMap)
 	applyControllersToPods(allocationMap, podJobMap)
 
-	// TODO niko/cdmr breakdown network costs?
+	// TODO niko/computeallocation breakdown network costs?
 
 	// Build out a map of Nodes with resource costs, discounts, and node types
 	// for converting resource allocation data to cumulative costs.
@@ -380,18 +414,18 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 	applyNodeSpot(nodeMap, resNodeIsSpot)
 	applyNodeDiscount(nodeMap, cm)
 
-	// TODO niko/cdmr comment
+	// TODO niko/computeallocation comment
 	pvMap := map[pvKey]*PV{}
 	buildPVMap(pvMap, resPVCostPerGiBHour)
 	applyPVBytes(pvMap, resPVBytes)
-	// TODO niko/cdmr apply PV bytes?
+	// TODO niko/computeallocation apply PV bytes?
 
-	// TODO niko/cdmr comment
+	// TODO niko/computeallocation comment
 	pvcMap := map[pvcKey]*PVC{}
 	buildPVCMap(window, pvcMap, pvMap, resPVCInfo)
 	applyPVCBytesRequested(pvcMap, resPVCBytesRequested)
 
-	// TODO niko/cdmr comment
+	// TODO niko/computeallocation comment
 	podPVCMap := map[podKey][]*PVC{}
 	buildPodPVCMap(podPVCMap, pvMap, pvcMap, podAllocation, resPodPVCAllocation)
 
@@ -436,7 +470,14 @@ func (cm *CostModel) ComputeAllocation(start, end time.Time) (*kubecost.Allocati
 				gib := pvc.Bytes / 1024 / 1024 / 1024
 
 				alloc.PVByteHours += pvc.Bytes * hrs
-				alloc.PVCost += pvc.Volume.CostPerGiBHour * gib * hrs / float64(pvc.Count)
+
+				count := float64(pvc.Count)
+				if pvc.Count < 1 {
+					// TODO niko/computeallocation why is this happening?
+					log.Warningf("CostModel.ComputeAllocation: PVC.Count=%d for %s", pvc.Count, alloc.Name)
+					count = 1
+				}
+				alloc.PVCost += pvc.Volume.CostPerGiBHour * gib * hrs / count
 			}
 		}
 
@@ -596,7 +637,7 @@ func applyCPUCoresRequested(allocationMap map[containerKey]*kubecost.Allocation,
 
 		// CPU allocation is less than requests, so set CPUCoreHours to
 		// request level.
-		// TODO niko/cdmr why is this happening?
+		// TODO niko/computeallocation why is this happening?
 		if allocationMap[key].CPUCores() < res.Values[0].Value {
 			allocationMap[key].CPUCoreHours = res.Values[0].Value * (allocationMap[key].Minutes() / 60.0)
 		}
@@ -672,7 +713,7 @@ func applyRAMBytesRequested(allocationMap map[containerKey]*kubecost.Allocation,
 
 		// RAM allocation is less than requests, so set RAMByteHours to
 		// request level.
-		// TODO niko/cdmr why is this happening?
+		// TODO niko/computeallocation why is this happening?
 		if allocationMap[key].RAMBytes() < res.Values[0].Value {
 			allocationMap[key].RAMByteHours = res.Values[0].Value * (allocationMap[key].Minutes() / 60.0)
 		}
@@ -718,8 +759,11 @@ func applyGPUsRequested(allocationMap map[containerKey]*kubecost.Allocation, res
 			continue
 		}
 
-		// TODO niko/cdmr complete
+		// TODO niko/computeallocation remove log
 		log.Infof("CostModel.ComputeAllocation: GPU results: %s=%f", key, res.Values[0].Value)
+
+		hrs := allocationMap[key].Minutes() / 60.0
+		allocationMap[key].GPUHours = res.Values[0].Value * hrs
 	}
 }
 
@@ -971,7 +1015,7 @@ func labelsToPodControllerMap(podLabels map[podKey]map[string]string, controller
 
 			podLabelSet := labels.Set(pLabels)
 			if selector.Matches(podLabelSet) {
-				// TODO niko/cdmr does this need to be one-to-many? In that case, we'd
+				// TODO niko/computeallocation does this need to be one-to-many? In that case, we'd
 				// need a different Allocation schema
 				if _, ok := podControllerMap[pKey]; ok {
 					log.Warningf("CostModel.ComputeAllocation: PodControllerMap match already exists: %s matches %s and %s", pKey, podControllerMap[pKey], cKey)
@@ -1218,7 +1262,7 @@ func applyNodeDiscount(nodeMap map[nodeKey]*Node, cm *CostModel) {
 	}
 
 	for _, node := range nodeMap {
-		// TODO niko/cdmr take RI into account?
+		// TODO niko/computeallocation take RI into account?
 		node.Discount = cm.Provider.CombinedDiscountForNode(node.NodeType, node.Preemptible, discount, negotiatedDiscount)
 		node.CostPerCPUHr *= (1.0 - node.Discount)
 		node.CostPerRAMGiBHr *= (1.0 - node.Discount)
@@ -1379,6 +1423,7 @@ func buildPodPVCMap(podPVCMap map[podKey][]*PVC, pvMap map[pvKey]*PV, pvcMap map
 			continue
 		}
 
+		// TODO niko/computeallocation is this working?
 		pvc.Count = len(podAllocation[podKey])
 
 		podPVCMap[podKey] = append(podPVCMap[podKey], pvc)
@@ -1413,8 +1458,8 @@ func applyUnmountedPVs(window kubecost.Window, allocationMap map[containerKey]*k
 	for cluster, amount := range unmountedPVCost {
 		container := "unmounted-pvs"
 		pod := "unmounted-pvs"
-		namespace := "" // TODO niko/cdmr what about this?
-		node := ""      // TODO niko/cdmr what about this?
+		namespace := "" // TODO niko/computeallocation what about this?
+		node := ""      // TODO niko/computeallocation what about this?
 
 		containerKey := newContainerKey(cluster, namespace, pod, container)
 		allocationMap[containerKey] = &kubecost.Allocation{

+ 3 - 3
pkg/costmodel/costmodel.go

@@ -46,7 +46,7 @@ const (
 // isCron matches a CronJob name and captures the non-timestamp name
 var isCron = regexp.MustCompile(`^(.+)-\d{10}$`)
 
-// TODO niko/cdmr both Thanos and Prometheus, or just one?
+// TODO niko/computeallocation both Thanos and Prometheus, or just one?
 type CostModel struct {
 	Cache            clustercache.ClusterCache
 	ClusterMap       clusters.ClusterMap
@@ -71,7 +71,7 @@ func NewCostModel(client prometheus.Client, provider costAnalyzerCloud.Provider,
 	}
 }
 
-// TODO niko/cdmr
+// TODO niko/computeallocation
 type ContainerAllocation struct {
 	Properties      ContainerProperties          `json:"properties"`
 	RAMReq          []*util.Vector               `json:"ramreq,omitempty"`
@@ -90,7 +90,7 @@ type ContainerAllocation struct {
 	ClusterName     string                       `json:"clusterName"`
 }
 
-// TODO niko/cdmr
+// TODO niko/computeallocation
 type ContainerProperties struct {
 	Container      string            `json:"container"`
 	Pod            string            `json:"pod"`

+ 4 - 4
pkg/kubecost/allocation.go

@@ -43,7 +43,7 @@ const ShareNone = "__none__"
 // Allocation is a unit of resource allocation and cost for a given window
 // of time and for a given kubernetes construct with its associated set of
 // properties.
-// TODO niko/cdmr compute efficiency on the fly?
+// TODO niko/computeallocation compute efficiency on the fly?
 type Allocation struct {
 	Name                   string     `json:"name"`
 	Properties             Properties `json:"properties,omitempty"`
@@ -253,7 +253,7 @@ func (a *Allocation) MarshalJSON() ([]byte, error) {
 	return buffer.Bytes(), nil
 }
 
-// TODO niko/cdmr
+// TODO niko/computeallocation
 // func (a *Allocation)UnmarshalJSON()
 
 // Resolution returns the duration of time covered by the Allocation
@@ -290,7 +290,7 @@ func (a *Allocation) Minutes() float64 {
 
 // Share works like Add, but converts the entire cost of the given Allocation
 // to SharedCost, rather than adding to the individual resource costs.
-// TODO niko/cdmr unit test changes!!!
+// TODO niko/computeallocation unit test changes!!!
 func (a *Allocation) Share(that *Allocation) (*Allocation, error) {
 	if that == nil {
 		return a.Clone(), nil
@@ -1602,7 +1602,7 @@ func (as *AllocationSet) accumulate(that *AllocationSet) (*AllocationSet, error)
 		return as, nil
 	}
 
-	// TODO niko/cdmr implement first
+	// TODO niko/computeallocation implement first
 	// if that.Window.Overlaps(as.Window) {
 	// 	return nil, fmt.Errorf("AllocationSet.accumulate: overlapping windows: %s", that.Window, as.Window)
 	// }

+ 1 - 3
pkg/kubecost/json.go

@@ -11,10 +11,8 @@ import (
 
 func jsonEncodeFloat64(buffer *bytes.Buffer, name string, val float64, comma string) {
 	var encoding string
-	if math.IsNaN(val) {
+	if math.IsNaN(val) || math.IsInf(val, 0) {
 		encoding = fmt.Sprintf("\"%s\":null%s", name, comma)
-	} else if math.IsInf(val, 0) {
-		encoding = fmt.Sprintf("\"%s\":\"inf\"%s", name, comma)
 	} else {
 		encoding = fmt.Sprintf("\"%s\":%f%s", name, val, comma)
 	}

+ 3 - 3
pkg/kubecost/window.go

@@ -446,7 +446,7 @@ func (w Window) Minutes() float64 {
 
 // Overlaps returns true iff the two given Windows share an amount of temporal
 // coverage.
-// TODO niko/cdmr return to this, with unit tests, and then implement in
+// TODO niko/computeallocation return to this, with unit tests, and then implement in
 // AllocationSet.accumulate
 func (w Window) Overlaps(x Window) bool {
 	if (w.start == nil && w.end == nil) || (x.start == nil && x.end == nil) {
@@ -495,11 +495,11 @@ func (w Window) Overlaps(x Window) bool {
 	//  [------]
 	//     ?------?
 	if x.start == nil {
-		// TODO niko/cdmr
+		// TODO niko/computeallocation
 	}
 
 	if x.end == nil {
-		// TODO niko/cdmr
+		// TODO niko/computeallocation
 	}
 
 	// Both are closed.

+ 1 - 1
pkg/kubecost/window_test.go

@@ -602,7 +602,7 @@ func TestWindow_DurationOffsetStrings(t *testing.T) {
 }
 
 func TestWindow_Overlaps(t *testing.T) {
-	// TODO niko/cdmr
+	// TODO niko/computeallocation
 }
 
 // TODO