// metricsquerier.go — Prometheus-backed implementation of the data source's
// MetricsQuerier interface.
  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/opencost/opencost/core/pkg/source"
  6. "github.com/opencost/opencost/core/pkg/util/timeutil"
  7. prometheus "github.com/prometheus/client_golang/api"
  8. )
  9. //--------------------------------------------------------------------------
  10. // PrometheusMetricsQuerier
  11. //--------------------------------------------------------------------------
  12. // PrometheusMetricsQuerier is the implementation of the data source's MetricsQuerier interface for Prometheus.
type PrometheusMetricsQuerier struct {
	// promConfig supplies the cluster filter/label and data-resolution
	// settings interpolated into every PromQL query template below.
	promConfig *OpenCostPrometheusConfig
	// promClient is the underlying Prometheus API client.
	promClient prometheus.Client
	// promContexts creates the named query contexts through which all
	// queries are issued.
	promContexts *ContextFactory
}
  18. func newPrometheusMetricsQuerier(
  19. promConfig *OpenCostPrometheusConfig,
  20. promClient prometheus.Client,
  21. promContexts *ContextFactory,
  22. ) *PrometheusMetricsQuerier {
  23. return &PrometheusMetricsQuerier{
  24. promConfig: promConfig,
  25. promClient: promClient,
  26. promContexts: promContexts,
  27. }
  28. }
  29. func (pds *PrometheusMetricsQuerier) QueryPVPricePerGiBHour(start, end time.Time) *source.Future[source.PVPricePerGiBHourResult] {
  30. const pvCostQuery = `avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (%s, persistentvolume, volumename, provider_id)`
  31. durStr := timeutil.DurationString(end.Sub(start))
  32. if durStr == "" {
  33. panic("failed to parse duration string passed to QueryPVCost")
  34. }
  35. queryPVCost := fmt.Sprintf(pvCostQuery, pds.promConfig.ClusterFilter, durStr, pds.promConfig.ClusterLabel)
  36. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  37. return source.NewFuture(source.DecodePVPricePerGiBHourResult, ctx.QueryAtTime(queryPVCost, end))
  38. }
  39. func (pds *PrometheusMetricsQuerier) QueryPVUsedAverage(start, end time.Time) *source.Future[source.PVUsedAvgResult] {
  40. // `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  41. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  42. const pvUsedAverageQuery = `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  43. cfg := pds.promConfig
  44. durStr := timeutil.DurationString(end.Sub(start))
  45. if durStr == "" {
  46. panic("failed to parse duration string passed to QueryPVUsedAverage")
  47. }
  48. queryPVUsedAvg := fmt.Sprintf(pvUsedAverageQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  49. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  50. return source.NewFuture(source.DecodePVUsedAvgResult, ctx.QueryAtTime(queryPVUsedAvg, end))
  51. }
  52. func (pds *PrometheusMetricsQuerier) QueryPVUsedMax(start, end time.Time) *source.Future[source.PVUsedMaxResult] {
  53. // `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  54. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  55. const pvUsedMaxQuery = `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  56. cfg := pds.promConfig
  57. durStr := timeutil.DurationString(end.Sub(start))
  58. if durStr == "" {
  59. panic("failed to parse duration string passed to QueryPVUsedMax")
  60. }
  61. queryPVUsedMax := fmt.Sprintf(pvUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  62. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  63. return source.NewFuture(source.DecodePVUsedMaxResult, ctx.QueryAtTime(queryPVUsedMax, end))
  64. }
  65. func (pds *PrometheusMetricsQuerier) QueryPVCInfo(start, end time.Time) *source.Future[source.PVCInfoResult] {
  66. const queryFmtPVCInfo = `avg(kube_persistentvolumeclaim_info{volumename != "", %s}) by (persistentvolumeclaim, storageclass, volumename, namespace, %s)[%s:%dm]`
  67. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  68. cfg := pds.promConfig
  69. minsPerResolution := cfg.DataResolutionMinutes
  70. durStr := pds.durationStringFor(start, end, minsPerResolution)
  71. if durStr == "" {
  72. panic("failed to parse duration string passed to QueryPVCInfo")
  73. }
  74. queryPVCInfo := fmt.Sprintf(queryFmtPVCInfo, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  75. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  76. return source.NewFuture(source.DecodePVCInfoResult, ctx.QueryAtTime(queryPVCInfo, end))
  77. }
  78. func (pds *PrometheusMetricsQuerier) QueryPVActiveMinutes(start, end time.Time) *source.Future[source.PVActiveMinutesResult] {
  79. const pvActiveMinsQuery = `avg(kube_persistentvolume_capacity_bytes{%s}) by (%s, persistentvolume)[%s:%dm]`
  80. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  81. cfg := pds.promConfig
  82. minsPerResolution := cfg.DataResolutionMinutes
  83. durStr := pds.durationStringFor(start, end, minsPerResolution)
  84. if durStr == "" {
  85. panic("failed to parse duration string passed to QueryPVActiveMinutes")
  86. }
  87. queryPVActiveMins := fmt.Sprintf(pvActiveMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  88. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  89. return source.NewFuture(source.DecodePVActiveMinutesResult, ctx.QueryAtTime(queryPVActiveMins, end))
  90. }
  91. func (pds *PrometheusMetricsQuerier) QueryLocalStorageCost(start, end time.Time) *source.Future[source.LocalStorageCostResult] {
  92. // `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  93. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  94. const localStorageCostQuery = `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  95. cfg := pds.promConfig
  96. minsPerResolution := cfg.DataResolutionMinutes
  97. durStr := pds.durationStringFor(start, end, minsPerResolution)
  98. if durStr == "" {
  99. panic("failed to parse duration string passed to QueryLocalStorageCost")
  100. }
  101. // hourlyToCumulative is a scaling factor that, when multiplied by an
  102. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  103. // [min/res]*[hr/min] = [$/res]
  104. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  105. costPerGBHr := 0.04 / 730.0
  106. queryLocalStorageCost := fmt.Sprintf(localStorageCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  107. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  108. return source.NewFuture(source.DecodeLocalStorageCostResult, ctx.QueryAtTime(queryLocalStorageCost, end))
  109. }
  110. func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedCost(start, end time.Time) *source.Future[source.LocalStorageUsedCostResult] {
  111. // `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  112. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  113. const localStorageUsedCostQuery = `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  114. cfg := pds.promConfig
  115. minsPerResolution := cfg.DataResolutionMinutes
  116. durStr := pds.durationStringFor(start, end, minsPerResolution)
  117. if durStr == "" {
  118. panic("failed to parse duration string passed to QueryLocalStorageUsedCost")
  119. }
  120. // hourlyToCumulative is a scaling factor that, when multiplied by an
  121. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  122. // [min/res]*[hr/min] = [$/res]
  123. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  124. costPerGBHr := 0.04 / 730.0
  125. queryLocalStorageUsedCost := fmt.Sprintf(localStorageUsedCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  126. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  127. return source.NewFuture(source.DecodeLocalStorageUsedCostResult, ctx.QueryAtTime(queryLocalStorageUsedCost, end))
  128. }
  129. func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedAvg(start, end time.Time) *source.Future[source.LocalStorageUsedAvgResult] {
  130. // `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  131. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  132. const localStorageUsedAvgQuery = `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  133. cfg := pds.promConfig
  134. durStr := timeutil.DurationString(end.Sub(start))
  135. if durStr == "" {
  136. panic("failed to parse duration string passed to QueryLocalStorageUsedAvg")
  137. }
  138. queryLocalStorageUsedAvg := fmt.Sprintf(localStorageUsedAvgQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  139. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  140. return source.NewFuture(source.DecodeLocalStorageUsedAvgResult, ctx.QueryAtTime(queryLocalStorageUsedAvg, end))
  141. }
  142. func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedMax(start, end time.Time) *source.Future[source.LocalStorageUsedMaxResult] {
  143. // `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  144. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  145. const localStorageUsedMaxQuery = `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  146. cfg := pds.promConfig
  147. durStr := timeutil.DurationString(end.Sub(start))
  148. if durStr == "" {
  149. panic("failed to parse duration string passed to QueryLocalStorageUsedMax")
  150. }
  151. queryLocalStorageUsedMax := fmt.Sprintf(localStorageUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  152. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  153. return source.NewFuture(source.DecodeLocalStorageUsedMaxResult, ctx.QueryAtTime(queryLocalStorageUsedMax, end))
  154. }
  155. func (pds *PrometheusMetricsQuerier) QueryLocalStorageBytes(start, end time.Time) *source.Future[source.LocalStorageBytesResult] {
  156. // `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  157. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  158. const localStorageBytesQuery = `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  159. cfg := pds.promConfig
  160. minsPerResolution := cfg.DataResolutionMinutes
  161. durStr := pds.durationStringFor(start, end, minsPerResolution)
  162. if durStr == "" {
  163. panic("failed to parse duration string passed to QueryLocalStorageBytes")
  164. }
  165. queryLocalStorageBytes := fmt.Sprintf(localStorageBytesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  166. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  167. return source.NewFuture(source.DecodeLocalStorageBytesResult, ctx.QueryAtTime(queryLocalStorageBytes, end))
  168. }
  169. func (pds *PrometheusMetricsQuerier) QueryLocalStorageActiveMinutes(start, end time.Time) *source.Future[source.LocalStorageActiveMinutesResult] {
  170. // `count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`
  171. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  172. const localStorageActiveMinutesQuery = `count(node_total_hourly_cost{%s}) by (%s, node, instance, provider_id)[%s:%dm]`
  173. cfg := pds.promConfig
  174. minsPerResolution := cfg.DataResolutionMinutes
  175. durStr := pds.durationStringFor(start, end, minsPerResolution)
  176. if durStr == "" {
  177. panic("failed to parse duration string passed to QueryLocalStorageActiveMinutes")
  178. }
  179. queryLocalStorageActiveMins := fmt.Sprintf(localStorageActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  180. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  181. return source.NewFuture(source.DecodeLocalStorageActiveMinutesResult, ctx.QueryAtTime(queryLocalStorageActiveMins, end))
  182. }
  183. func (pds *PrometheusMetricsQuerier) QueryNodeCPUCoresCapacity(start, end time.Time) *source.Future[source.NodeCPUCoresCapacityResult] {
  184. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  185. const nodeCPUCoresCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (%s, node)`
  186. cfg := pds.promConfig
  187. durStr := timeutil.DurationString(end.Sub(start))
  188. if durStr == "" {
  189. panic("failed to parse duration string passed to QueryNodeCPUCoresCapacity")
  190. }
  191. queryNodeCPUCoresCapacity := fmt.Sprintf(nodeCPUCoresCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  192. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  193. return source.NewFuture(source.DecodeNodeCPUCoresCapacityResult, ctx.QueryAtTime(queryNodeCPUCoresCapacity, end))
  194. }
  195. func (pds *PrometheusMetricsQuerier) QueryNodeCPUCoresAllocatable(start, end time.Time) *source.Future[source.NodeCPUCoresAllocatableResult] {
  196. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  197. const nodeCPUCoresAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_cpu_cores{%s}[%s])) by (%s, node)`
  198. // `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  199. cfg := pds.promConfig
  200. durStr := timeutil.DurationString(end.Sub(start))
  201. if durStr == "" {
  202. panic("failed to parse duration string passed to QueryNodeCPUCoresAllocatable")
  203. }
  204. queryNodeCPUCoresAllocatable := fmt.Sprintf(nodeCPUCoresAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  205. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  206. return source.NewFuture(source.DecodeNodeCPUCoresAllocatableResult, ctx.QueryAtTime(queryNodeCPUCoresAllocatable, end))
  207. }
  208. func (pds *PrometheusMetricsQuerier) QueryNodeRAMBytesCapacity(start, end time.Time) *source.Future[source.NodeRAMBytesCapacityResult] {
  209. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  210. const nodeRAMBytesCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (%s, node)`
  211. cfg := pds.promConfig
  212. durStr := timeutil.DurationString(end.Sub(start))
  213. if durStr == "" {
  214. panic("failed to parse duration string passed to QueryNodeRAMBytesCapacity")
  215. }
  216. queryNodeRAMBytesCapacity := fmt.Sprintf(nodeRAMBytesCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  217. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  218. return source.NewFuture(source.DecodeNodeRAMBytesCapacityResult, ctx.QueryAtTime(queryNodeRAMBytesCapacity, end))
  219. }
  220. func (pds *PrometheusMetricsQuerier) QueryNodeRAMBytesAllocatable(start, end time.Time) *source.Future[source.NodeRAMBytesAllocatableResult] {
  221. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  222. const nodeRAMBytesAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_memory_bytes{%s}[%s])) by (%s, node)`
  223. cfg := pds.promConfig
  224. durStr := timeutil.DurationString(end.Sub(start))
  225. if durStr == "" {
  226. panic("failed to parse duration string passed to QueryNodeRAMBytesAllocatable")
  227. }
  228. queryNodeRAMBytesAllocatable := fmt.Sprintf(nodeRAMBytesAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  229. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  230. return source.NewFuture(source.DecodeNodeRAMBytesAllocatableResult, ctx.QueryAtTime(queryNodeRAMBytesAllocatable, end))
  231. }
  232. func (pds *PrometheusMetricsQuerier) QueryNodeGPUCount(start, end time.Time) *source.Future[source.NodeGPUCountResult] {
  233. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  234. const nodeGPUCountQuery = `avg(avg_over_time(node_gpu_count{%s}[%s])) by (%s, node, provider_id)`
  235. cfg := pds.promConfig
  236. durStr := timeutil.DurationString(end.Sub(start))
  237. if durStr == "" {
  238. panic("failed to parse duration string passed to QueryNodeGPUCount")
  239. }
  240. queryNodeGPUCount := fmt.Sprintf(nodeGPUCountQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  241. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  242. return source.NewFuture(source.DecodeNodeGPUCountResult, ctx.QueryAtTime(queryNodeGPUCount, end))
  243. }
  244. func (pds *PrometheusMetricsQuerier) QueryNodeLabels(start, end time.Time) *source.Future[source.NodeLabelsResult] {
  245. const labelsQuery = `avg_over_time(kube_node_labels{%s}[%s])`
  246. // env.GetPromClusterFilter(), durStr
  247. cfg := pds.promConfig
  248. durStr := timeutil.DurationString(end.Sub(start))
  249. if durStr == "" {
  250. panic("failed to parse duration string passed to QueryNodeLabels")
  251. }
  252. queryLabels := fmt.Sprintf(labelsQuery, cfg.ClusterFilter, durStr)
  253. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  254. return source.NewFuture(source.DecodeNodeLabelsResult, ctx.QueryAtTime(queryLabels, end))
  255. }
  256. func (pds *PrometheusMetricsQuerier) QueryNodeActiveMinutes(start, end time.Time) *source.Future[source.NodeActiveMinutesResult] {
  257. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  258. const activeMinsQuery = `avg(node_total_hourly_cost{%s}) by (node, %s, provider_id)[%s:%dm]`
  259. cfg := pds.promConfig
  260. minsPerResolution := cfg.DataResolutionMinutes
  261. durStr := pds.durationStringFor(start, end, minsPerResolution)
  262. if durStr == "" {
  263. panic("failed to parse duration string passed to QueryNodeActiveMinutes")
  264. }
  265. queryActiveMins := fmt.Sprintf(activeMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  266. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  267. return source.NewFuture(source.DecodeNodeActiveMinutesResult, ctx.QueryAtTime(queryActiveMins, end))
  268. }
  269. func (pds *PrometheusMetricsQuerier) QueryNodeCPUModeTotal(start, end time.Time) *source.Future[source.NodeCPUModeTotalResult] {
  270. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel())
  271. const nodeCPUModeTotalQuery = `sum(rate(node_cpu_seconds_total{%s}[%s:%dm])) by (kubernetes_node, %s, mode)`
  272. cfg := pds.promConfig
  273. minsPerResolution := cfg.DataResolutionMinutes
  274. durStr := pds.durationStringFor(start, end, minsPerResolution)
  275. if durStr == "" {
  276. panic("failed to parse duration string passed to QueryNodeCPUModeTotal")
  277. }
  278. queryCPUModeTotal := fmt.Sprintf(nodeCPUModeTotalQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  279. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  280. return source.NewFuture(source.DecodeNodeCPUModeTotalResult, ctx.QueryAtTime(queryCPUModeTotal, end))
  281. }
  282. func (pds *PrometheusMetricsQuerier) QueryNodeRAMSystemPercent(start, end time.Time) *source.Future[source.NodeRAMSystemPercentResult] {
  283. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  284. const nodeRAMSystemPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  285. cfg := pds.promConfig
  286. minsPerResolution := cfg.DataResolutionMinutes
  287. durStr := pds.durationStringFor(start, end, minsPerResolution)
  288. if durStr == "" {
  289. panic("failed to parse duration string passed to QueryNodeRAMSystemPercent")
  290. }
  291. queryRAMSystemPct := fmt.Sprintf(nodeRAMSystemPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  292. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  293. return source.NewFuture(source.DecodeNodeRAMSystemPercentResult, ctx.QueryAtTime(queryRAMSystemPct, end))
  294. }
  295. func (pds *PrometheusMetricsQuerier) QueryNodeRAMUserPercent(start, end time.Time) *source.Future[source.NodeRAMUserPercentResult] {
  296. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  297. const nodeRAMUserPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  298. cfg := pds.promConfig
  299. minsPerResolution := cfg.DataResolutionMinutes
  300. durStr := pds.durationStringFor(start, end, minsPerResolution)
  301. if durStr == "" {
  302. panic("failed to parse duration string passed to QueryNodeRAMUserPercent")
  303. }
  304. queryRAMUserPct := fmt.Sprintf(nodeRAMUserPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  305. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  306. return source.NewFuture(source.DecodeNodeRAMUserPercentResult, ctx.QueryAtTime(queryRAMUserPct, end))
  307. }
  308. func (pds *PrometheusMetricsQuerier) QueryLBPricePerHr(start, end time.Time) *source.Future[source.LBPricePerHrResult] {
  309. const queryFmtLBCostPerHr = `avg(avg_over_time(kubecost_load_balancer_cost{%s}[%s])) by (namespace, service_name, ingress_ip, %s)`
  310. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  311. cfg := pds.promConfig
  312. durStr := timeutil.DurationString(end.Sub(start))
  313. if durStr == "" {
  314. panic("failed to parse duration string passed to QueryLBPricePerHr")
  315. }
  316. queryLBCostPerHr := fmt.Sprintf(queryFmtLBCostPerHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  317. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  318. return source.NewFuture(source.DecodeLBPricePerHrResult, ctx.QueryAtTime(queryLBCostPerHr, end))
  319. }
  320. func (pds *PrometheusMetricsQuerier) QueryLBActiveMinutes(start, end time.Time) *source.Future[source.LBActiveMinutesResult] {
  321. const lbActiveMinutesQuery = `avg(kubecost_load_balancer_cost{%s}) by (namespace, service_name, %s, ingress_ip)[%s:%dm]`
  322. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  323. cfg := pds.promConfig
  324. minsPerResolution := cfg.DataResolutionMinutes
  325. durStr := pds.durationStringFor(start, end, minsPerResolution)
  326. if durStr == "" {
  327. panic("failed to parse duration string passed to QueryLBActiveMinutes")
  328. }
  329. queryLBActiveMins := fmt.Sprintf(lbActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  330. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  331. return source.NewFuture(source.DecodeLBActiveMinutesResult, ctx.QueryAtTime(queryLBActiveMins, end))
  332. }
  333. func (pds *PrometheusMetricsQuerier) QueryClusterManagementDuration(start, end time.Time) *source.Future[source.ClusterManagementDurationResult] {
  334. const clusterManagementDurationQuery = `avg(kubecost_cluster_management_cost{%s}) by (%s, provisioner_name)[%s:%dm]`
  335. cfg := pds.promConfig
  336. minsPerResolution := cfg.DataResolutionMinutes
  337. durStr := pds.durationStringFor(start, end, minsPerResolution)
  338. if durStr == "" {
  339. panic("failed to parse duration string passed to QueryClusterManagementDuration")
  340. }
  341. queryClusterManagementDuration := fmt.Sprintf(clusterManagementDurationQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  342. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  343. return source.NewFuture(source.DecodeClusterManagementDurationResult, ctx.QueryAtTime(queryClusterManagementDuration, end))
  344. }
  345. func (pds *PrometheusMetricsQuerier) QueryClusterManagementPricePerHr(start, end time.Time) *source.Future[source.ClusterManagementPricePerHrResult] {
  346. const clusterManagementCostQuery = `avg(avg_over_time(kubecost_cluster_management_cost{%s}[%s])) by (%s, provisioner_name)`
  347. // env.GetPromClusterFilter(), durationStr, env.GetPromClusterLabel()
  348. cfg := pds.promConfig
  349. durStr := timeutil.DurationString(end.Sub(start))
  350. if durStr == "" {
  351. panic("failed to parse duration string passed to QueryClusterManagementCost")
  352. }
  353. queryClusterManagementCost := fmt.Sprintf(clusterManagementCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  354. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  355. return source.NewFuture(source.DecodeClusterManagementPricePerHrResult, ctx.QueryAtTime(queryClusterManagementCost, end))
  356. }
  357. // AllocationMetricQuerier
  358. func (pds *PrometheusMetricsQuerier) QueryPods(start, end time.Time) *source.Future[source.PodsResult] {
  359. const queryFmtPods = `avg(kube_pod_container_status_running{%s} != 0) by (pod, namespace, %s)[%s:%dm]`
  360. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  361. cfg := pds.promConfig
  362. minsPerResolution := cfg.DataResolutionMinutes
  363. durStr := pds.durationStringFor(start, end, minsPerResolution)
  364. if durStr == "" {
  365. panic("failed to parse duration string passed to QueryPods")
  366. }
  367. queryPods := fmt.Sprintf(queryFmtPods, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  368. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  369. return source.NewFuture(source.DecodePodsResult, ctx.QueryAtTime(queryPods, end))
  370. }
  371. func (pds *PrometheusMetricsQuerier) QueryPodsUID(start, end time.Time) *source.Future[source.PodsResult] {
  372. const queryFmtPodsUID = `avg(kube_pod_container_status_running{%s} != 0) by (pod, namespace, uid, %s)[%s:%dm]`
  373. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  374. cfg := pds.promConfig
  375. minsPerResolution := cfg.DataResolutionMinutes
  376. durStr := pds.durationStringFor(start, end, minsPerResolution)
  377. if durStr == "" {
  378. panic("failed to parse duration string passed to QueryPodsUID")
  379. }
  380. queryPodsUID := fmt.Sprintf(queryFmtPodsUID, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  381. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  382. return source.NewFuture(source.DecodePodsResult, ctx.QueryAtTime(queryPodsUID, end))
  383. }
  384. func (pds *PrometheusMetricsQuerier) QueryRAMBytesAllocated(start, end time.Time) *source.Future[source.RAMBytesAllocatedResult] {
  385. const queryFmtRAMBytesAllocated = `avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s, provider_id)`
  386. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  387. cfg := pds.promConfig
  388. durStr := timeutil.DurationString(end.Sub(start))
  389. if durStr == "" {
  390. panic("failed to parse duration string passed to QueryRAMBytesAllocated")
  391. }
  392. queryRAMBytesAllocated := fmt.Sprintf(queryFmtRAMBytesAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  393. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  394. return source.NewFuture(source.DecodeRAMBytesAllocatedResult, ctx.QueryAtTime(queryRAMBytesAllocated, end))
  395. }
  396. func (pds *PrometheusMetricsQuerier) QueryRAMRequests(start, end time.Time) *source.Future[source.RAMRequestsResult] {
  397. const queryFmtRAMRequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  398. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  399. cfg := pds.promConfig
  400. durStr := timeutil.DurationString(end.Sub(start))
  401. if durStr == "" {
  402. panic("failed to parse duration string passed to QueryRAMRequests")
  403. }
  404. queryRAMRequests := fmt.Sprintf(queryFmtRAMRequests, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  405. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  406. return source.NewFuture(source.DecodeRAMRequestsResult, ctx.QueryAtTime(queryRAMRequests, end))
  407. }
  408. func (pds *PrometheusMetricsQuerier) QueryRAMUsageAvg(start, end time.Time) *source.Future[source.RAMUsageAvgResult] {
  409. const queryFmtRAMUsageAvg = `avg(avg_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
  410. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  411. cfg := pds.promConfig
  412. durStr := timeutil.DurationString(end.Sub(start))
  413. if durStr == "" {
  414. panic("failed to parse duration string passed to QueryRAMUsageAvg")
  415. }
  416. queryRAMUsageAvg := fmt.Sprintf(queryFmtRAMUsageAvg, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  417. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  418. return source.NewFuture(source.DecodeRAMUsageAvgResult, ctx.QueryAtTime(queryRAMUsageAvg, end))
  419. }
  420. func (pds *PrometheusMetricsQuerier) QueryRAMUsageMax(start, end time.Time) *source.Future[source.RAMUsageMaxResult] {
  421. const queryFmtRAMUsageMax = `max(max_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
  422. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  423. cfg := pds.promConfig
  424. durStr := timeutil.DurationString(end.Sub(start))
  425. if durStr == "" {
  426. panic("failed to parse duration string passed to QueryRAMUsageMax")
  427. }
  428. queryRAMUsageMax := fmt.Sprintf(queryFmtRAMUsageMax, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  429. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  430. return source.NewFuture(source.DecodeRAMUsageMaxResult, ctx.QueryAtTime(queryRAMUsageMax, end))
  431. }
  432. func (pds *PrometheusMetricsQuerier) QueryCPUCoresAllocated(start, end time.Time) *source.Future[source.CPUCoresAllocatedResult] {
  433. const queryFmtCPUCoresAllocated = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  434. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  435. cfg := pds.promConfig
  436. durStr := timeutil.DurationString(end.Sub(start))
  437. if durStr == "" {
  438. panic("failed to parse duration string passed to QueryCPUCoresAllocated")
  439. }
  440. queryCPUCoresAllocated := fmt.Sprintf(queryFmtCPUCoresAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  441. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  442. return source.NewFuture(source.DecodeCPUCoresAllocatedResult, ctx.QueryAtTime(queryCPUCoresAllocated, end))
  443. }
  444. func (pds *PrometheusMetricsQuerier) QueryCPURequests(start, end time.Time) *source.Future[source.CPURequestsResult] {
  445. const queryFmtCPURequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  446. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  447. cfg := pds.promConfig
  448. durStr := timeutil.DurationString(end.Sub(start))
  449. if durStr == "" {
  450. panic("failed to parse duration string passed to QueryCPURequests")
  451. }
  452. queryCPURequests := fmt.Sprintf(queryFmtCPURequests, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  453. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  454. return source.NewFuture(source.DecodeCPURequestsResult, ctx.QueryAtTime(queryCPURequests, end))
  455. }
  456. func (pds *PrometheusMetricsQuerier) QueryCPUUsageAvg(start, end time.Time) *source.Future[source.CPUUsageAvgResult] {
  457. const queryFmtCPUUsageAvg = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
  458. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  459. cfg := pds.promConfig
  460. durStr := timeutil.DurationString(end.Sub(start))
  461. if durStr == "" {
  462. panic("failed to parse duration string passed to QueryCPUUsageAvg")
  463. }
  464. queryCPUUsageAvg := fmt.Sprintf(queryFmtCPUUsageAvg, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  465. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  466. return source.NewFuture(source.DecodeCPUUsageAvgResult, ctx.QueryAtTime(queryCPUUsageAvg, end))
  467. }
// QueryCPUUsageMax returns a future resolving to the maximum CPU usage rate
// per container over [start, end]. It first tries a cheap query against the
// kubecost recording-rule metric; if that query returns no data, it falls
// back to an equivalent (but more expensive) Prometheus subquery.
//
// NOTE: unlike the other query methods, this one BLOCKS on the recording-rule
// query (via Await) in order to decide whether the fallback is needed.
func (pds *PrometheusMetricsQuerier) QueryCPUUsageMax(start, end time.Time) *source.Future[source.CPUUsageMaxResult] {
	// Because we use container_cpu_usage_seconds_total to calculate CPU usage
	// at any given "instant" of time, we need to use an irate or rate. To then
	// calculate a max (or any aggregation) we have to perform an aggregation
	// query on top of an instant-by-instant maximum. Prometheus supports this
	// type of query with a "subquery" [1], however it is reportedly expensive
	// to make such a query. By default, Kubecost's Prometheus config includes
	// a recording rule that keeps track of the instant-by-instant irate for CPU
	// usage. The metric in this query is created by that recording rule.
	//
	// [1] https://prometheus.io/blog/2019/01/28/subquery-support/
	//
	// If changing the name of the recording rule, make sure to update the
	// corresponding diagnostic query to avoid confusion.
	const queryFmtCPUUsageMaxRecordingRule = `max(max_over_time(kubecost_container_cpu_usage_irate{%s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
	// args: cfg.ClusterFilter, durStr, cfg.ClusterLabel
	// This is the subquery equivalent of the above recording rule query. It is
	// more expensive, but does not require the recording rule. It should be
	// used as a fallback query if the recording rule data does not exist.
	//
	// The parameter after the colon [:<thisone>] in the subquery affects the
	// resolution of the subquery.
	// The parameter after the metric ...{}[<thisone>] should be set to 2x
	// the resolution, to make sure the irate always has two points to query
	// in case the Prom scrape duration has been reduced to be equal to the
	// query resolution.
	const queryFmtCPUUsageMaxSubquery = `max(max_over_time(irate(container_cpu_usage_seconds_total{container!="POD", container!="", %s}[%dm])[%s:%dm])) by (container, pod_name, pod, namespace, node, instance, %s)`
	// args: cfg.ClusterFilter, doubleResStr, durStr, resStr, cfg.ClusterLabel
	cfg := pds.promConfig
	durStr := timeutil.DurationString(end.Sub(start))
	if durStr == "" {
		panic("failed to parse duration string passed to QueryCPUUsageMax")
	}
	queryCPUUsageMaxRecordingRule := fmt.Sprintf(queryFmtCPUUsageMaxRecordingRule, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
	ctx := pds.promContexts.NewNamedContext(AllocationContextName)
	// Run the recording-rule query and block for its result.
	resCPUUsageMaxRR := ctx.QueryAtTime(queryCPUUsageMaxRecordingRule, end)
	// NOTE(review): the error from Await is discarded, so a failed query is
	// treated the same as "no recording-rule data" and triggers the subquery
	// fallback below — confirm this is intended.
	resCPUUsageMax, _ := resCPUUsageMaxRR.Await()
	if len(resCPUUsageMax) > 0 {
		// Recording-rule data exists; wrap the already-fetched results.
		return wrapResults(queryCPUUsageMaxRecordingRule, source.DecodeCPUUsageMaxResult, resCPUUsageMax)
	}
	// Fallback: recompute the window at the configured resolution and issue
	// the expensive subquery instead.
	minsPerResolution := cfg.DataResolutionMinutes
	durStr = pds.durationStringFor(start, end, minsPerResolution)
	if durStr == "" {
		panic("failed to parse duration string passed to QueryCPUUsageMax")
	}
	queryCPUUsageMaxSubquery := fmt.Sprintf(queryFmtCPUUsageMaxSubquery, cfg.ClusterFilter, 2*minsPerResolution, durStr, minsPerResolution, cfg.ClusterLabel)
	return source.NewFuture(source.DecodeCPUUsageMaxResult, ctx.QueryAtTime(queryCPUUsageMaxSubquery, end))
}
  516. func (pds *PrometheusMetricsQuerier) QueryGPUsRequested(start, end time.Time) *source.Future[source.GPUsRequestedResult] {
  517. const queryFmtGPUsRequested = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  518. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  519. cfg := pds.promConfig
  520. durStr := timeutil.DurationString(end.Sub(start))
  521. if durStr == "" {
  522. panic("failed to parse duration string passed to QueryGPUsRequested")
  523. }
  524. queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  525. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  526. return source.NewFuture(source.DecodeGPUsRequestedResult, ctx.QueryAtTime(queryGPUsRequested, end))
  527. }
  528. func (pds *PrometheusMetricsQuerier) QueryGPUsUsageAvg(start, end time.Time) *source.Future[source.GPUsUsageAvgResult] {
  529. const queryFmtGPUsUsageAvg = `avg(avg_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
  530. // durStr, env.GetPromClusterLabel()
  531. cfg := pds.promConfig
  532. durStr := timeutil.DurationString(end.Sub(start))
  533. if durStr == "" {
  534. panic("failed to parse duration string passed to QueryGPUsUsageAvg")
  535. }
  536. queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, cfg.ClusterLabel)
  537. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  538. return source.NewFuture(source.DecodeGPUsUsageAvgResult, ctx.QueryAtTime(queryGPUsUsageAvg, end))
  539. }
  540. func (pds *PrometheusMetricsQuerier) QueryGPUsUsageMax(start, end time.Time) *source.Future[source.GPUsUsageMaxResult] {
  541. const queryFmtGPUsUsageMax = `max(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
  542. // durStr, env.GetPromClusterLabel()
  543. cfg := pds.promConfig
  544. durStr := timeutil.DurationString(end.Sub(start))
  545. if durStr == "" {
  546. panic("failed to parse duration string passed to QueryGPUsUsageMax")
  547. }
  548. queryGPUsUsageMax := fmt.Sprintf(queryFmtGPUsUsageMax, durStr, cfg.ClusterLabel)
  549. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  550. return source.NewFuture(source.DecodeGPUsUsageMaxResult, ctx.QueryAtTime(queryGPUsUsageMax, end))
  551. }
  552. func (pds *PrometheusMetricsQuerier) QueryGPUsAllocated(start, end time.Time) *source.Future[source.GPUsAllocatedResult] {
  553. const queryFmtGPUsAllocated = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  554. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  555. cfg := pds.promConfig
  556. durStr := timeutil.DurationString(end.Sub(start))
  557. if durStr == "" {
  558. panic("failed to parse duration string passed to QueryGPUsAllocated")
  559. }
  560. queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  561. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  562. return source.NewFuture(source.DecodeGPUsAllocatedResult, ctx.QueryAtTime(queryGPUsAllocated, end))
  563. }
  564. func (pds *PrometheusMetricsQuerier) QueryIsGPUShared(start, end time.Time) *source.Future[source.IsGPUSharedResult] {
  565. const queryFmtIsGPUShared = `avg(avg_over_time(kube_pod_container_resource_requests{container!="", node != "", pod != "", container!= "", unit = "integer", %s}[%s])) by (container, pod, namespace, node, resource, %s)`
  566. // env.GetPromClusterFilter(), durStr
  567. cfg := pds.promConfig
  568. durStr := timeutil.DurationString(end.Sub(start))
  569. if durStr == "" {
  570. panic("failed to parse duration string passed to QueryIsGPUShared")
  571. }
  572. queryIsGPUShared := fmt.Sprintf(queryFmtIsGPUShared, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  573. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  574. return source.NewFuture(source.DecodeIsGPUSharedResult, ctx.QueryAtTime(queryIsGPUShared, end))
  575. }
  576. func (pds *PrometheusMetricsQuerier) QueryGPUInfo(start, end time.Time) *source.Future[source.GPUInfoResult] {
  577. const queryFmtGetGPUInfo = `avg(avg_over_time(DCGM_FI_DEV_DEC_UTIL{container!="",%s}[%s])) by (container, pod, namespace, device, modelName, UUID, %s)`
  578. // env.GetPromClusterFilter(), durStr
  579. cfg := pds.promConfig
  580. durStr := timeutil.DurationString(end.Sub(start))
  581. if durStr == "" {
  582. panic("failed to parse duration string passed to QueryGPUInfo")
  583. }
  584. queryGetGPUInfo := fmt.Sprintf(queryFmtGetGPUInfo, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  585. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  586. return source.NewFuture(source.DecodeGPUInfoResult, ctx.QueryAtTime(queryGetGPUInfo, end))
  587. }
  588. func (pds *PrometheusMetricsQuerier) QueryNodeCPUPricePerHr(start, end time.Time) *source.Future[source.NodeCPUPricePerHrResult] {
  589. const queryFmtNodeCostPerCPUHr = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  590. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  591. cfg := pds.promConfig
  592. durStr := timeutil.DurationString(end.Sub(start))
  593. if durStr == "" {
  594. panic("failed to parse duration string passed to QueryNodeCPUPricePerHr")
  595. }
  596. queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  597. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  598. return source.NewFuture(source.DecodeNodeCPUPricePerHrResult, ctx.QueryAtTime(queryNodeCostPerCPUHr, end))
  599. }
  600. func (pds *PrometheusMetricsQuerier) QueryNodeRAMPricePerGiBHr(start, end time.Time) *source.Future[source.NodeRAMPricePerGiBHrResult] {
  601. const queryFmtNodeCostPerRAMGiBHr = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  602. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  603. cfg := pds.promConfig
  604. durStr := timeutil.DurationString(end.Sub(start))
  605. if durStr == "" {
  606. panic("failed to parse duration string passed to QueryNodeRAMPricePerGiBHr")
  607. }
  608. queryNodeCostPerRAMGiBHr := fmt.Sprintf(queryFmtNodeCostPerRAMGiBHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  609. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  610. return source.NewFuture(source.DecodeNodeRAMPricePerGiBHrResult, ctx.QueryAtTime(queryNodeCostPerRAMGiBHr, end))
  611. }
  612. func (pds *PrometheusMetricsQuerier) QueryNodeGPUPricePerHr(start, end time.Time) *source.Future[source.NodeGPUPricePerHrResult] {
  613. const queryFmtNodeCostPerGPUHr = `avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  614. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  615. cfg := pds.promConfig
  616. durStr := timeutil.DurationString(end.Sub(start))
  617. if durStr == "" {
  618. panic("failed to parse duration string passed to QueryNodeGPUPricePerHr")
  619. }
  620. queryNodeCostPerGPUHr := fmt.Sprintf(queryFmtNodeCostPerGPUHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  621. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  622. return source.NewFuture(source.DecodeNodeGPUPricePerHrResult, ctx.QueryAtTime(queryNodeCostPerGPUHr, end))
  623. }
  624. func (pds *PrometheusMetricsQuerier) QueryNodeIsSpot(start, end time.Time) *source.Future[source.NodeIsSpotResult] {
  625. const queryFmtNodeIsSpot = `avg_over_time(kubecost_node_is_spot{%s}[%s])`
  626. //`avg_over_time(kubecost_node_is_spot{%s}[%s:%dm])`
  627. // env.GetPromClusterFilter(), durStr)
  628. cfg := pds.promConfig
  629. durStr := timeutil.DurationString(end.Sub(start))
  630. if durStr == "" {
  631. panic("failed to parse duration string passed to QueryNodeIsSpot2")
  632. }
  633. queryNodeIsSpot := fmt.Sprintf(queryFmtNodeIsSpot, cfg.ClusterFilter, durStr)
  634. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  635. return source.NewFuture(source.DecodeNodeIsSpotResult, ctx.QueryAtTime(queryNodeIsSpot, end))
  636. }
  637. func (pds *PrometheusMetricsQuerier) QueryPodPVCAllocation(start, end time.Time) *source.Future[source.PodPVCAllocationResult] {
  638. const queryFmtPodPVCAllocation = `avg(avg_over_time(pod_pvc_allocation{%s}[%s])) by (persistentvolume, persistentvolumeclaim, pod, namespace, %s)`
  639. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  640. cfg := pds.promConfig
  641. durStr := timeutil.DurationString(end.Sub(start))
  642. if durStr == "" {
  643. panic("failed to parse duration string passed to QueryPodPVCAllocation")
  644. }
  645. queryPodPVCAllocation := fmt.Sprintf(queryFmtPodPVCAllocation, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  646. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  647. return source.NewFuture(source.DecodePodPVCAllocationResult, ctx.QueryAtTime(queryPodPVCAllocation, end))
  648. }
  649. func (pds *PrometheusMetricsQuerier) QueryPVCBytesRequested(start, end time.Time) *source.Future[source.PVCBytesRequestedResult] {
  650. const queryFmtPVCBytesRequested = `avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{%s}[%s])) by (persistentvolumeclaim, namespace, %s)`
  651. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  652. cfg := pds.promConfig
  653. durStr := timeutil.DurationString(end.Sub(start))
  654. if durStr == "" {
  655. panic("failed to parse duration string passed to QueryPVCBytesRequested")
  656. }
  657. queryPVCBytesRequested := fmt.Sprintf(queryFmtPVCBytesRequested, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  658. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  659. return source.NewFuture(source.DecodePVCBytesRequestedResult, ctx.QueryAtTime(queryPVCBytesRequested, end))
  660. }
  661. func (pds *PrometheusMetricsQuerier) QueryPVBytes(start, end time.Time) *source.Future[source.PVBytesResult] {
  662. const queryFmtPVBytes = `avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (persistentvolume, %s)`
  663. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  664. cfg := pds.promConfig
  665. durStr := timeutil.DurationString(end.Sub(start))
  666. if durStr == "" {
  667. panic("failed to parse duration string passed to QueryPVBytes")
  668. }
  669. queryPVBytes := fmt.Sprintf(queryFmtPVBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  670. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  671. return source.NewFuture(source.DecodePVBytesResult, ctx.QueryAtTime(queryPVBytes, end))
  672. }
  673. func (pds *PrometheusMetricsQuerier) QueryPVInfo(start, end time.Time) *source.Future[source.PVInfoResult] {
  674. const queryFmtPVMeta = `avg(avg_over_time(kubecost_pv_info{%s}[%s])) by (%s, storageclass, persistentvolume, provider_id)`
  675. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  676. cfg := pds.promConfig
  677. durStr := timeutil.DurationString(end.Sub(start))
  678. if durStr == "" {
  679. panic("failed to parse duration string passed to QueryPVMeta")
  680. }
  681. queryPVMeta := fmt.Sprintf(queryFmtPVMeta, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  682. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  683. return source.NewFuture(source.DecodePVInfoResult, ctx.QueryAtTime(queryPVMeta, end))
  684. }
  685. func (pds *PrometheusMetricsQuerier) QueryNetZoneGiB(start, end time.Time) *source.Future[source.NetZoneGiBResult] {
  686. const queryFmtNetZoneGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", same_zone="false", same_region="true", %s}[%s:%dm])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  687. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  688. cfg := pds.promConfig
  689. minsPerResolution := cfg.DataResolutionMinutes
  690. durStr := pds.durationStringFor(start, end, minsPerResolution)
  691. if durStr == "" {
  692. panic("failed to parse duration string passed to QueryNetZoneGiB")
  693. }
  694. queryNetZoneGiB := fmt.Sprintf(queryFmtNetZoneGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  695. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  696. return source.NewFuture(source.DecodeNetZoneGiBResult, ctx.QueryAtTime(queryNetZoneGiB, end))
  697. }
  698. func (pds *PrometheusMetricsQuerier) QueryNetZonePricePerGiB(start, end time.Time) *source.Future[source.NetZonePricePerGiBResult] {
  699. const queryFmtNetZoneCostPerGiB = `avg(avg_over_time(kubecost_network_zone_egress_cost{%s}[%s])) by (%s)`
  700. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  701. cfg := pds.promConfig
  702. durStr := timeutil.DurationString(end.Sub(start))
  703. if durStr == "" {
  704. panic("failed to parse duration string passed to QueryNetZonePricePerGiB")
  705. }
  706. queryNetZoneCostPerGiB := fmt.Sprintf(queryFmtNetZoneCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  707. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  708. return source.NewFuture(source.DecodeNetZonePricePerGiBResult, ctx.QueryAtTime(queryNetZoneCostPerGiB, end))
  709. }
  710. func (pds *PrometheusMetricsQuerier) QueryNetRegionGiB(start, end time.Time) *source.Future[source.NetRegionGiBResult] {
  711. const queryFmtNetRegionGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", same_zone="false", same_region="false", %s}[%s:%dm])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  712. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  713. cfg := pds.promConfig
  714. minsPerResolution := cfg.DataResolutionMinutes
  715. durStr := pds.durationStringFor(start, end, minsPerResolution)
  716. if durStr == "" {
  717. panic("failed to parse duration string passed to QueryNetRegionGiB")
  718. }
  719. queryNetRegionGiB := fmt.Sprintf(queryFmtNetRegionGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  720. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  721. return source.NewFuture(source.DecodeNetRegionGiBResult, ctx.QueryAtTime(queryNetRegionGiB, end))
  722. }
  723. func (pds *PrometheusMetricsQuerier) QueryNetRegionPricePerGiB(start, end time.Time) *source.Future[source.NetRegionPricePerGiBResult] {
  724. const queryFmtNetRegionCostPerGiB = `avg(avg_over_time(kubecost_network_region_egress_cost{%s}[%s])) by (%s)`
  725. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  726. cfg := pds.promConfig
  727. durStr := timeutil.DurationString(end.Sub(start))
  728. if durStr == "" {
  729. panic("failed to parse duration string passed to QueryNetRegionPricePerGiB")
  730. }
  731. queryNetRegionCostPerGiB := fmt.Sprintf(queryFmtNetRegionCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  732. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  733. return source.NewFuture(source.DecodeNetRegionPricePerGiBResult, ctx.QueryAtTime(queryNetRegionCostPerGiB, end))
  734. }
  735. func (pds *PrometheusMetricsQuerier) QueryNetInternetGiB(start, end time.Time) *source.Future[source.NetInternetGiBResult] {
  736. const queryFmtNetInternetGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s:%dm])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  737. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  738. cfg := pds.promConfig
  739. minsPerResolution := cfg.DataResolutionMinutes
  740. durStr := pds.durationStringFor(start, end, minsPerResolution)
  741. if durStr == "" {
  742. panic("failed to parse duration string passed to QueryNetInternetGiB")
  743. }
  744. queryNetInternetGiB := fmt.Sprintf(queryFmtNetInternetGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  745. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  746. return source.NewFuture(source.DecodeNetInternetGiBResult, ctx.QueryAtTime(queryNetInternetGiB, end))
  747. }
  748. func (pds *PrometheusMetricsQuerier) QueryNetInternetPricePerGiB(start, end time.Time) *source.Future[source.NetInternetPricePerGiBResult] {
  749. const queryFmtNetInternetCostPerGiB = `avg(avg_over_time(kubecost_network_internet_egress_cost{%s}[%s])) by (%s)`
  750. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  751. cfg := pds.promConfig
  752. durStr := timeutil.DurationString(end.Sub(start))
  753. if durStr == "" {
  754. panic("failed to parse duration string passed to QueryNetInternetPricePerGiB")
  755. }
  756. queryNetInternetCostPerGiB := fmt.Sprintf(queryFmtNetInternetCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  757. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  758. return source.NewFuture(source.DecodeNetInternetPricePerGiBResult, ctx.QueryAtTime(queryNetInternetCostPerGiB, end))
  759. }
  760. func (pds *PrometheusMetricsQuerier) QueryNetInternetServiceGiB(start, end time.Time) *source.Future[source.NetInternetServiceGiBResult] {
  761. const queryFmtNetInternetGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s:%dm])) by (pod_name, namespace, service, %s) / 1024 / 1024 / 1024`
  762. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  763. cfg := pds.promConfig
  764. minsPerResolution := cfg.DataResolutionMinutes
  765. durStr := pds.durationStringFor(start, end, minsPerResolution)
  766. if durStr == "" {
  767. panic("failed to parse duration string passed to QueryNetInternetGiB")
  768. }
  769. queryNetInternetGiB := fmt.Sprintf(queryFmtNetInternetGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  770. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  771. return source.NewFuture(source.DecodeNetInternetServiceGiBResult, ctx.QueryAtTime(queryNetInternetGiB, end))
  772. }
  773. func (pds *PrometheusMetricsQuerier) QueryNetTransferBytes(start, end time.Time) *source.Future[source.NetTransferBytesResult] {
  774. const queryFmtNetTransferBytes = `sum(increase(container_network_transmit_bytes_total{pod!="", %s}[%s:%dm])) by (pod_name, pod, namespace, %s)`
  775. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  776. cfg := pds.promConfig
  777. minsPerResolution := cfg.DataResolutionMinutes
  778. durStr := pds.durationStringFor(start, end, minsPerResolution)
  779. if durStr == "" {
  780. panic("failed to parse duration string passed to QueryNetTransferBytes")
  781. }
  782. queryNetTransferBytes := fmt.Sprintf(queryFmtNetTransferBytes, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  783. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  784. return source.NewFuture(source.DecodeNetTransferBytesResult, ctx.QueryAtTime(queryNetTransferBytes, end))
  785. }
  786. func (pds *PrometheusMetricsQuerier) QueryNetZoneIngressGiB(start, end time.Time) *source.Future[source.NetZoneIngressGiBResult] {
  787. const queryFmtIngNetZoneGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="false", same_zone="false", same_region="true", %s}[%s:%dm])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  788. cfg := pds.promConfig
  789. minsPerResolution := cfg.DataResolutionMinutes
  790. durStr := pds.durationStringFor(start, end, minsPerResolution)
  791. if durStr == "" {
  792. panic("failed to parse duration string passed to QueryNetZoneIngressGiB")
  793. }
  794. queryNetZoneCostPerGiB := fmt.Sprintf(queryFmtIngNetZoneGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  795. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  796. return source.NewFuture(source.DecodeNetZoneIngressGiBResult, ctx.QueryAtTime(queryNetZoneCostPerGiB, end))
  797. }
  798. func (pds *PrometheusMetricsQuerier) QueryNetRegionIngressGiB(start, end time.Time) *source.Future[source.NetRegionIngressGiBResult] {
  799. const queryFmtIngNetRegionGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="false", same_zone="false", same_region="false", %s}[%s:%dm])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  800. cfg := pds.promConfig
  801. minsPerResolution := cfg.DataResolutionMinutes
  802. durStr := pds.durationStringFor(start, end, minsPerResolution)
  803. if durStr == "" {
  804. panic("failed to parse duration string passed to QueryNetRegionIngressGiB")
  805. }
  806. queryNetRegionIngGiB := fmt.Sprintf(queryFmtIngNetRegionGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  807. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  808. return source.NewFuture(source.DecodeNetRegionIngressGiBResult, ctx.QueryAtTime(queryNetRegionIngGiB, end))
  809. }
  810. func (pds *PrometheusMetricsQuerier) QueryNetInternetIngressGiB(start, end time.Time) *source.Future[source.NetInternetIngressGiBResult] {
  811. const queryFmtNetIngInternetGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="true", %s}[%s:%dm])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  812. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  813. cfg := pds.promConfig
  814. minsPerResolution := cfg.DataResolutionMinutes
  815. durStr := pds.durationStringFor(start, end, minsPerResolution)
  816. if durStr == "" {
  817. panic("failed to parse duration string passed to QueryNetInternetIngressGiB")
  818. }
  819. queryNetIngInternetGiB := fmt.Sprintf(queryFmtNetIngInternetGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  820. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  821. return source.NewFuture(source.DecodeNetInternetIngressGiBResult, ctx.QueryAtTime(queryNetIngInternetGiB, end))
  822. }
  823. func (pds *PrometheusMetricsQuerier) QueryNetInternetServiceIngressGiB(start, end time.Time) *source.Future[source.NetInternetServiceIngressGiBResult] {
  824. const queryFmtIngNetInternetGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="true", %s}[%s:%dm])) by (pod_name, namespace, service, %s) / 1024 / 1024 / 1024`
  825. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  826. cfg := pds.promConfig
  827. minsPerResolution := cfg.DataResolutionMinutes
  828. durStr := pds.durationStringFor(start, end, minsPerResolution)
  829. if durStr == "" {
  830. panic("failed to parse duration string passed to QueryNetInternetServiceIngressGiB")
  831. }
  832. queryNetIngInternetGiB := fmt.Sprintf(queryFmtIngNetInternetGiB, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  833. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  834. return source.NewFuture(source.DecodeNetInternetServiceIngressGiBResult, ctx.QueryAtTime(queryNetIngInternetGiB, end))
  835. }
  836. func (pds *PrometheusMetricsQuerier) QueryNetReceiveBytes(start, end time.Time) *source.Future[source.NetReceiveBytesResult] {
  837. const queryFmtNetReceiveBytes = `sum(increase(container_network_receive_bytes_total{pod!="", %s}[%s:%dm])) by (pod_name, pod, namespace, %s)`
  838. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  839. cfg := pds.promConfig
  840. minsPerResolution := cfg.DataResolutionMinutes
  841. durStr := pds.durationStringFor(start, end, minsPerResolution)
  842. if durStr == "" {
  843. panic("failed to parse duration string passed to QueryNetReceiveBytes")
  844. }
  845. queryNetReceiveBytes := fmt.Sprintf(queryFmtNetReceiveBytes, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  846. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  847. return source.NewFuture(source.DecodeNetReceiveBytesResult, ctx.QueryAtTime(queryNetReceiveBytes, end))
  848. }
  849. func (pds *PrometheusMetricsQuerier) QueryNamespaceLabels(start, end time.Time) *source.Future[source.NamespaceLabelsResult] {
  850. const queryFmtNamespaceLabels = `avg_over_time(kube_namespace_labels{%s}[%s])`
  851. // env.GetPromClusterFilter(), durStr
  852. cfg := pds.promConfig
  853. durStr := timeutil.DurationString(end.Sub(start))
  854. if durStr == "" {
  855. panic("failed to parse duration string passed to QueryNamespaceLabels")
  856. }
  857. queryNamespaceLabels := fmt.Sprintf(queryFmtNamespaceLabels, cfg.ClusterFilter, durStr)
  858. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  859. return source.NewFuture(source.DecodeNamespaceLabelsResult, ctx.QueryAtTime(queryNamespaceLabels, end))
  860. }
  861. func (pds *PrometheusMetricsQuerier) QueryNamespaceAnnotations(start, end time.Time) *source.Future[source.NamespaceAnnotationsResult] {
  862. const queryFmtNamespaceAnnotations = `avg_over_time(kube_namespace_annotations{%s}[%s])`
  863. // env.GetPromClusterFilter(), durStr
  864. cfg := pds.promConfig
  865. durStr := timeutil.DurationString(end.Sub(start))
  866. if durStr == "" {
  867. panic("failed to parse duration string passed to QueryNamespaceAnnotations")
  868. }
  869. queryNamespaceAnnotations := fmt.Sprintf(queryFmtNamespaceAnnotations, cfg.ClusterFilter, durStr)
  870. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  871. return source.NewFuture(source.DecodeNamespaceAnnotationsResult, ctx.QueryAtTime(queryNamespaceAnnotations, end))
  872. }
  873. func (pds *PrometheusMetricsQuerier) QueryPodLabels(start, end time.Time) *source.Future[source.PodLabelsResult] {
  874. const queryFmtPodLabels = `avg_over_time(kube_pod_labels{%s}[%s])`
  875. // env.GetPromClusterFilter(), durStr
  876. cfg := pds.promConfig
  877. durStr := timeutil.DurationString(end.Sub(start))
  878. if durStr == "" {
  879. panic("failed to parse duration string passed to QueryPodLabels")
  880. }
  881. queryPodLabels := fmt.Sprintf(queryFmtPodLabels, cfg.ClusterFilter, durStr)
  882. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  883. return source.NewFuture(source.DecodePodLabelsResult, ctx.QueryAtTime(queryPodLabels, end))
  884. }
  885. func (pds *PrometheusMetricsQuerier) QueryPodAnnotations(start, end time.Time) *source.Future[source.PodAnnotationsResult] {
  886. const queryFmtPodAnnotations = `avg_over_time(kube_pod_annotations{%s}[%s])`
  887. // env.GetPromClusterFilter(), durStr
  888. cfg := pds.promConfig
  889. durStr := timeutil.DurationString(end.Sub(start))
  890. if durStr == "" {
  891. panic("failed to parse duration string passed to QueryPodAnnotations")
  892. }
  893. queryPodAnnotations := fmt.Sprintf(queryFmtPodAnnotations, cfg.ClusterFilter, durStr)
  894. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  895. return source.NewFuture(source.DecodePodAnnotationsResult, ctx.QueryAtTime(queryPodAnnotations, end))
  896. }
  897. func (pds *PrometheusMetricsQuerier) QueryServiceLabels(start, end time.Time) *source.Future[source.ServiceLabelsResult] {
  898. const queryFmtServiceLabels = `avg_over_time(service_selector_labels{%s}[%s])`
  899. // env.GetPromClusterFilter(), durStr
  900. cfg := pds.promConfig
  901. durStr := timeutil.DurationString(end.Sub(start))
  902. if durStr == "" {
  903. panic("failed to parse duration string passed to QueryServiceLabels")
  904. }
  905. queryServiceLabels := fmt.Sprintf(queryFmtServiceLabels, cfg.ClusterFilter, durStr)
  906. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  907. return source.NewFuture(source.DecodeServiceLabelsResult, ctx.QueryAtTime(queryServiceLabels, end))
  908. }
  909. func (pds *PrometheusMetricsQuerier) QueryDeploymentLabels(start, end time.Time) *source.Future[source.DeploymentLabelsResult] {
  910. const queryFmtDeploymentLabels = `avg_over_time(deployment_match_labels{%s}[%s])`
  911. // env.GetPromClusterFilter(), durStr
  912. cfg := pds.promConfig
  913. durStr := timeutil.DurationString(end.Sub(start))
  914. if durStr == "" {
  915. panic("failed to parse duration string passed to QueryNamespaceAnnotations")
  916. }
  917. queryDeploymentLabels := fmt.Sprintf(queryFmtDeploymentLabels, cfg.ClusterFilter, durStr)
  918. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  919. return source.NewFuture(source.DecodeDeploymentLabelsResult, ctx.QueryAtTime(queryDeploymentLabels, end))
  920. }
  921. func (pds *PrometheusMetricsQuerier) QueryStatefulSetLabels(start, end time.Time) *source.Future[source.StatefulSetLabelsResult] {
  922. const queryFmtStatefulSetLabels = `avg_over_time(statefulSet_match_labels{%s}[%s])`
  923. // env.GetPromClusterFilter(), durStr
  924. cfg := pds.promConfig
  925. durStr := timeutil.DurationString(end.Sub(start))
  926. if durStr == "" {
  927. panic("failed to parse duration string passed to QueryStatefulSetLabels")
  928. }
  929. queryStatefulSetLabels := fmt.Sprintf(queryFmtStatefulSetLabels, cfg.ClusterFilter, durStr)
  930. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  931. return source.NewFuture(source.DecodeStatefulSetLabelsResult, ctx.QueryAtTime(queryStatefulSetLabels, end))
  932. }
  933. func (pds *PrometheusMetricsQuerier) QueryDaemonSetLabels(start, end time.Time) *source.Future[source.DaemonSetLabelsResult] {
  934. const queryFmtDaemonSetLabels = `sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet", %s}[%s])) by (pod, owner_name, namespace, %s)`
  935. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  936. cfg := pds.promConfig
  937. durStr := timeutil.DurationString(end.Sub(start))
  938. if durStr == "" {
  939. panic("failed to parse duration string passed to QueryDaemonSetLabels")
  940. }
  941. queryDaemonSetLabels := fmt.Sprintf(queryFmtDaemonSetLabels, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  942. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  943. return source.NewFuture(source.DecodeDaemonSetLabelsResult, ctx.QueryAtTime(queryDaemonSetLabels, end))
  944. }
  945. func (pds *PrometheusMetricsQuerier) QueryJobLabels(start, end time.Time) *source.Future[source.JobLabelsResult] {
  946. const queryFmtJobLabels = `sum(avg_over_time(kube_pod_owner{owner_kind="Job", %s}[%s])) by (pod, owner_name, namespace ,%s)`
  947. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  948. cfg := pds.promConfig
  949. durStr := timeutil.DurationString(end.Sub(start))
  950. if durStr == "" {
  951. panic("failed to parse duration string passed to QueryJobLabels")
  952. }
  953. queryJobLabels := fmt.Sprintf(queryFmtJobLabels, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  954. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  955. return source.NewFuture(source.DecodeJobLabelsResult, ctx.QueryAtTime(queryJobLabels, end))
  956. }
  957. func (pds *PrometheusMetricsQuerier) QueryPodsWithReplicaSetOwner(start, end time.Time) *source.Future[source.PodsWithReplicaSetOwnerResult] {
  958. const queryFmtPodsWithReplicaSetOwner = `sum(avg_over_time(kube_pod_owner{owner_kind="ReplicaSet", %s}[%s])) by (pod, owner_name, namespace ,%s)`
  959. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  960. cfg := pds.promConfig
  961. durStr := timeutil.DurationString(end.Sub(start))
  962. if durStr == "" {
  963. panic("failed to parse duration string passed to QueryPodsWithReplicaSetOwner")
  964. }
  965. queryPodsWithReplicaSetOwner := fmt.Sprintf(queryFmtPodsWithReplicaSetOwner, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  966. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  967. return source.NewFuture(source.DecodePodsWithReplicaSetOwnerResult, ctx.QueryAtTime(queryPodsWithReplicaSetOwner, end))
  968. }
  969. func (pds *PrometheusMetricsQuerier) QueryReplicaSetsWithoutOwners(start, end time.Time) *source.Future[source.ReplicaSetsWithoutOwnersResult] {
  970. const queryFmtReplicaSetsWithoutOwners = `avg(avg_over_time(kube_replicaset_owner{owner_kind="<none>", owner_name="<none>", %s}[%s])) by (replicaset, namespace, %s)`
  971. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  972. cfg := pds.promConfig
  973. durStr := timeutil.DurationString(end.Sub(start))
  974. if durStr == "" {
  975. panic("failed to parse duration string passed to QueryReplicaSetsWithoutOwners")
  976. }
  977. queryReplicaSetsWithoutOwners := fmt.Sprintf(queryFmtReplicaSetsWithoutOwners, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  978. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  979. return source.NewFuture(source.DecodeReplicaSetsWithoutOwnersResult, ctx.QueryAtTime(queryReplicaSetsWithoutOwners, end))
  980. }
  981. func (pds *PrometheusMetricsQuerier) QueryReplicaSetsWithRollout(start, end time.Time) *source.Future[source.ReplicaSetsWithRolloutResult] {
  982. const queryFmtReplicaSetsWithRolloutOwner = `avg(avg_over_time(kube_replicaset_owner{owner_kind="Rollout", %s}[%s])) by (replicaset, namespace, owner_kind, owner_name, %s)`
  983. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  984. cfg := pds.promConfig
  985. durStr := timeutil.DurationString(end.Sub(start))
  986. if durStr == "" {
  987. panic("failed to parse duration string passed to QueryReplicaSetsWithRollout")
  988. }
  989. queryReplicaSetsWithRolloutOwner := fmt.Sprintf(queryFmtReplicaSetsWithRolloutOwner, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  990. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  991. return source.NewFuture(source.DecodeReplicaSetsWithRolloutResult, ctx.QueryAtTime(queryReplicaSetsWithRolloutOwner, end))
  992. }
  993. func (pds *PrometheusMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time, time.Time, error) {
  994. const (
  995. queryFmtOldestSample = `min_over_time(timestamp(group(node_cpu_hourly_cost{%s}))[%s:%s])`
  996. queryFmtNewestSample = `max_over_time(timestamp(group(node_cpu_hourly_cost{%s}))[%s:%s])`
  997. )
  998. cfg := pds.promConfig
  999. minutesPerDuration := 60
  1000. dur := time.Duration(limitDays) * timeutil.Day
  1001. end := time.Now().UTC().Truncate(timeutil.Day).Add(timeutil.Day)
  1002. start := end.Add(-dur)
  1003. durStr := pds.durationStringFor(start, end, minutesPerDuration)
  1004. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1005. queryOldest := fmt.Sprintf(queryFmtOldestSample, cfg.ClusterFilter, durStr, "1h")
  1006. resOldestFut := ctx.QueryAtTime(queryOldest, end)
  1007. resOldest, err := resOldestFut.Await()
  1008. if err != nil {
  1009. return time.Time{}, time.Time{}, fmt.Errorf("querying oldest sample: %w", err)
  1010. }
  1011. if len(resOldest) == 0 || len(resOldest[0].Values) == 0 {
  1012. return time.Time{}, time.Time{}, fmt.Errorf("querying oldest sample: %w", err)
  1013. }
  1014. oldest := time.Unix(int64(resOldest[0].Values[0].Value), 0)
  1015. queryNewest := fmt.Sprintf(queryFmtNewestSample, cfg.ClusterFilter, durStr, "1h")
  1016. resNewestFut := ctx.QueryAtTime(queryNewest, end)
  1017. resNewest, err := resNewestFut.Await()
  1018. if err != nil {
  1019. return time.Time{}, time.Time{}, fmt.Errorf("querying newest sample: %w", err)
  1020. }
  1021. if len(resNewest) == 0 || len(resNewest[0].Values) == 0 {
  1022. return time.Time{}, time.Time{}, fmt.Errorf("querying newest sample: %w", err)
  1023. }
  1024. newest := time.Unix(int64(resNewest[0].Values[0].Value), 0)
  1025. return oldest, newest, nil
  1026. }
  1027. func (pds *PrometheusMetricsQuerier) durationStringFor(start, end time.Time, minsPerResolution int) string {
  1028. dur := end.Sub(start)
  1029. // If using a version of Prometheus where the resolution needs duration offset,
  1030. // we need to apply that here.
  1031. //
  1032. // E.g. avg(node_total_hourly_cost{}) by (node, provider_id)[60m:5m] with
  1033. // time=01:00:00 will return, for a node running the entire time, 12
  1034. // timestamps where the first is 00:05:00 and the last is 01:00:00.
  1035. // However, OpenCost expects for there to be 13 timestamps where the first
  1036. // begins at 00:00:00. To achieve this, we must modify our query to
  1037. // avg(node_total_hourly_cost{}) by (node, provider_id)[65m:5m]
  1038. if pds.promConfig.IsOffsetResolution {
  1039. // increase the query time by the resolution
  1040. dur = dur + (time.Duration(minsPerResolution) * time.Minute)
  1041. }
  1042. return timeutil.DurationString(dur)
  1043. }
  1044. func newEmptyResult[T any](decoder source.ResultDecoder[T]) *source.Future[T] {
  1045. ch := make(source.QueryResultsChan)
  1046. go func() {
  1047. results := source.NewQueryResults("")
  1048. ch <- results
  1049. }()
  1050. return source.NewFuture(decoder, ch)
  1051. }
  1052. func wrapResults[T any](query string, decoder source.ResultDecoder[T], results []*source.QueryResult) *source.Future[T] {
  1053. ch := make(source.QueryResultsChan)
  1054. go func() {
  1055. r := source.NewQueryResults(query)
  1056. r.Results = results
  1057. ch <- r
  1058. }()
  1059. return source.NewFuture(decoder, ch)
  1060. }