|
@@ -28,6 +28,7 @@ const (
|
|
|
queryFmtCPURequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
|
|
queryFmtCPURequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
|
|
|
queryFmtCPUUsageAvg = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
|
|
queryFmtCPUUsageAvg = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
|
|
|
queryFmtGPUsRequested = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
|
|
queryFmtGPUsRequested = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
|
|
|
|
|
+ queryFmtGPUsUsageAvg = `avg(avg_over_time(DCGM_FI_DEV_GPU_UTIL{container!=""}[%s])) by (container, pod, namespace, %s)`
|
|
|
queryFmtGPUsAllocated = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
|
|
queryFmtGPUsAllocated = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
|
|
|
queryFmtNodeCostPerCPUHr = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
|
|
queryFmtNodeCostPerCPUHr = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
|
|
|
queryFmtNodeCostPerRAMGiBHr = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
|
|
queryFmtNodeCostPerRAMGiBHr = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
|
|
@@ -428,6 +429,9 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
|
|
|
queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
|
|
resChGPUsRequested := ctx.QueryAtTime(queryGPUsRequested, end)
|
|
|
|
|
|
|
|
|
|
+ queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, env.GetPromClusterLabel())
|
|
|
|
|
+ resChGPUsUsageAvg := ctx.Query(queryGPUsUsageAvg)
|
|
|
|
|
+
|
|
|
queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
|
|
|
resChGPUsAllocated := ctx.QueryAtTime(queryGPUsAllocated, end)
|
|
resChGPUsAllocated := ctx.QueryAtTime(queryGPUsAllocated, end)
|
|
|
|
|
|
|
@@ -544,6 +548,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
|
|
|
resRAMUsageAvg, _ := resChRAMUsageAvg.Await()
|
|
resRAMUsageAvg, _ := resChRAMUsageAvg.Await()
|
|
|
resRAMUsageMax, _ := resChRAMUsageMax.Await()
|
|
resRAMUsageMax, _ := resChRAMUsageMax.Await()
|
|
|
resGPUsRequested, _ := resChGPUsRequested.Await()
|
|
resGPUsRequested, _ := resChGPUsRequested.Await()
|
|
|
|
|
+ resGPUsUsageAvg, _ := resChGPUsUsageAvg.Await()
|
|
|
resGPUsAllocated, _ := resChGPUsAllocated.Await()
|
|
resGPUsAllocated, _ := resChGPUsAllocated.Await()
|
|
|
|
|
|
|
|
resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
|
|
resNodeCostPerCPUHr, _ := resChNodeCostPerCPUHr.Await()
|
|
@@ -610,6 +615,7 @@ func (cm *CostModel) computeAllocation(start, end time.Time, resolution time.Dur
|
|
|
applyRAMBytesRequested(podMap, resRAMRequests, podUIDKeyMap)
|
|
applyRAMBytesRequested(podMap, resRAMRequests, podUIDKeyMap)
|
|
|
applyRAMBytesUsedAvg(podMap, resRAMUsageAvg, podUIDKeyMap)
|
|
applyRAMBytesUsedAvg(podMap, resRAMUsageAvg, podUIDKeyMap)
|
|
|
applyRAMBytesUsedMax(podMap, resRAMUsageMax, podUIDKeyMap)
|
|
applyRAMBytesUsedMax(podMap, resRAMUsageMax, podUIDKeyMap)
|
|
|
|
|
+ applyGPUUsageAvg(podMap, resGPUsUsageAvg, podUIDKeyMap)
|
|
|
applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated, podUIDKeyMap)
|
|
applyGPUsAllocated(podMap, resGPUsRequested, resGPUsAllocated, podUIDKeyMap)
|
|
|
applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes, podUIDKeyMap)
|
|
applyNetworkTotals(podMap, resNetTransferBytes, resNetReceiveBytes, podUIDKeyMap)
|
|
|
applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB, podUIDKeyMap, networkCrossZoneCost)
|
|
applyNetworkAllocation(podMap, resNetZoneGiB, resNetZoneCostPerGiB, podUIDKeyMap, networkCrossZoneCost)
|