metricsquerier.go 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395
  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/opencost/opencost/core/pkg/source"
  6. "github.com/opencost/opencost/core/pkg/util/timeutil"
  7. prometheus "github.com/prometheus/client_golang/api"
  8. )
  9. //--------------------------------------------------------------------------
  10. // PrometheusMetricsQuerier
  11. //--------------------------------------------------------------------------
  12. // PrometheusMetricsQuerier is the implementation of the data source's MetricsQuerier interface for Prometheus.
// PrometheusMetricsQuerier is the implementation of the data source's
// MetricsQuerier interface for Prometheus. It holds both a plain Prometheus
// client and a Thanos client, each with its own config and context factory;
// the query methods below issue PromQL through the Prometheus context factory.
type PrometheusMetricsQuerier struct {
	promConfig   *OpenCostPrometheusConfig // cluster filter/label and data-resolution settings used to build queries
	promClient   prometheus.Client         // underlying Prometheus API client
	promContexts *ContextFactory           // factory for named query contexts (e.g. ClusterContextName, AllocationContextName)

	thanosConfig   *OpenCostThanosConfig // Thanos-specific configuration (usage not visible in this chunk)
	thanosClient   prometheus.Client     // Thanos API client (usage not visible in this chunk)
	thanosContexts *ContextFactory       // context factory for Thanos queries (usage not visible in this chunk)
}
  21. func newPrometheusMetricsQuerier(
  22. promConfig *OpenCostPrometheusConfig,
  23. promClient prometheus.Client,
  24. promContexts *ContextFactory,
  25. thanosConfig *OpenCostThanosConfig,
  26. thanosClient prometheus.Client,
  27. thanosContexts *ContextFactory,
  28. ) *PrometheusMetricsQuerier {
  29. return &PrometheusMetricsQuerier{
  30. promConfig: promConfig,
  31. promClient: promClient,
  32. promContexts: promContexts,
  33. thanosConfig: thanosConfig,
  34. thanosClient: thanosClient,
  35. thanosContexts: thanosContexts,
  36. }
  37. }
  38. func (pds *PrometheusMetricsQuerier) QueryPVPricePerGiBHour(start, end time.Time) *source.Future[source.PVPricePerGiBHourResult] {
  39. const pvCostQuery = `avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (%s, persistentvolume, volumename, provider_id)`
  40. durStr := timeutil.DurationString(end.Sub(start))
  41. if durStr == "" {
  42. panic("failed to parse duration string passed to QueryPVCost")
  43. }
  44. queryPVCost := fmt.Sprintf(pvCostQuery, pds.promConfig.ClusterFilter, durStr, pds.promConfig.ClusterLabel)
  45. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  46. return source.NewFuture(source.DecodePVPricePerGiBHourResult, ctx.QueryAtTime(queryPVCost, end))
  47. }
  48. func (pds *PrometheusMetricsQuerier) QueryPVUsedAverage(start, end time.Time) *source.Future[source.PVUsedAvgResult] {
  49. // `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  50. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  51. const pvUsedAverageQuery = `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  52. cfg := pds.promConfig
  53. durStr := timeutil.DurationString(end.Sub(start))
  54. if durStr == "" {
  55. panic("failed to parse duration string passed to QueryPVUsedAverage")
  56. }
  57. queryPVUsedAvg := fmt.Sprintf(pvUsedAverageQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  58. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  59. return source.NewFuture(source.DecodePVUsedAvgResult, ctx.QueryAtTime(queryPVUsedAvg, end))
  60. }
  61. func (pds *PrometheusMetricsQuerier) QueryPVUsedMax(start, end time.Time) *source.Future[source.PVUsedMaxResult] {
  62. // `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  63. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  64. const pvUsedMaxQuery = `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  65. cfg := pds.promConfig
  66. durStr := timeutil.DurationString(end.Sub(start))
  67. if durStr == "" {
  68. panic("failed to parse duration string passed to QueryPVUsedMax")
  69. }
  70. queryPVUsedMax := fmt.Sprintf(pvUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  71. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  72. return source.NewFuture(source.DecodePVUsedMaxResult, ctx.QueryAtTime(queryPVUsedMax, end))
  73. }
  74. func (pds *PrometheusMetricsQuerier) QueryPVCInfo(start, end time.Time) *source.Future[source.PVCInfoResult] {
  75. const queryFmtPVCInfo = `avg(kube_persistentvolumeclaim_info{volumename != "", %s}) by (persistentvolumeclaim, storageclass, volumename, namespace, %s)[%s:%s]`
  76. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  77. cfg := pds.promConfig
  78. resolution := cfg.DataResolution
  79. resStr := timeutil.DurationString(resolution)
  80. durStr := timeutil.DurationString(end.Sub(start))
  81. if durStr == "" {
  82. panic("failed to parse duration string passed to QueryPVCInfo")
  83. }
  84. queryPVCInfo := fmt.Sprintf(queryFmtPVCInfo, cfg.ClusterFilter, cfg.ClusterLabel, durStr, resStr)
  85. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  86. return source.NewFuture(source.DecodePVCInfoResult, ctx.QueryAtTime(queryPVCInfo, end))
  87. }
  88. func (pds *PrometheusMetricsQuerier) QueryPVActiveMinutes(start, end time.Time) *source.Future[source.PVActiveMinutesResult] {
  89. const pvActiveMinsQuery = `avg(kube_persistentvolume_capacity_bytes{%s}) by (%s, persistentvolume)[%s:%dm]`
  90. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  91. cfg := pds.promConfig
  92. minsPerResolution := cfg.DataResolutionMinutes
  93. durStr := pds.durationStringFor(start, end, minsPerResolution)
  94. if durStr == "" {
  95. panic("failed to parse duration string passed to QueryPVActiveMinutes")
  96. }
  97. queryPVActiveMins := fmt.Sprintf(pvActiveMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  98. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  99. return source.NewFuture(source.DecodePVActiveMinutesResult, ctx.QueryAtTime(queryPVActiveMins, end))
  100. }
  101. func (pds *PrometheusMetricsQuerier) QueryLocalStorageCost(start, end time.Time) *source.Future[source.LocalStorageCostResult] {
  102. // `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  103. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  104. const localStorageCostQuery = `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  105. cfg := pds.promConfig
  106. minsPerResolution := cfg.DataResolutionMinutes
  107. durStr := pds.durationStringFor(start, end, minsPerResolution)
  108. if durStr == "" {
  109. panic("failed to parse duration string passed to QueryLocalStorageCost")
  110. }
  111. // hourlyToCumulative is a scaling factor that, when multiplied by an
  112. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  113. // [min/res]*[hr/min] = [$/res]
  114. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  115. costPerGBHr := 0.04 / 730.0
  116. queryLocalStorageCost := fmt.Sprintf(localStorageCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  117. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  118. return source.NewFuture(source.DecodeLocalStorageCostResult, ctx.QueryAtTime(queryLocalStorageCost, end))
  119. }
  120. func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedCost(start, end time.Time) *source.Future[source.LocalStorageUsedCostResult] {
  121. // `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  122. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  123. const localStorageUsedCostQuery = `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  124. cfg := pds.promConfig
  125. minsPerResolution := cfg.DataResolutionMinutes
  126. durStr := pds.durationStringFor(start, end, minsPerResolution)
  127. if durStr == "" {
  128. panic("failed to parse duration string passed to QueryLocalStorageUsedCost")
  129. }
  130. // hourlyToCumulative is a scaling factor that, when multiplied by an
  131. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  132. // [min/res]*[hr/min] = [$/res]
  133. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  134. costPerGBHr := 0.04 / 730.0
  135. queryLocalStorageUsedCost := fmt.Sprintf(localStorageUsedCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  136. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  137. return source.NewFuture(source.DecodeLocalStorageUsedCostResult, ctx.QueryAtTime(queryLocalStorageUsedCost, end))
  138. }
  139. func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedAvg(start, end time.Time) *source.Future[source.LocalStorageUsedAvgResult] {
  140. // `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  141. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  142. const localStorageUsedAvgQuery = `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  143. cfg := pds.promConfig
  144. durStr := timeutil.DurationString(end.Sub(start))
  145. if durStr == "" {
  146. panic("failed to parse duration string passed to QueryLocalStorageUsedAvg")
  147. }
  148. queryLocalStorageUsedAvg := fmt.Sprintf(localStorageUsedAvgQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  149. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  150. return source.NewFuture(source.DecodeLocalStorageUsedAvgResult, ctx.QueryAtTime(queryLocalStorageUsedAvg, end))
  151. }
  152. func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedMax(start, end time.Time) *source.Future[source.LocalStorageUsedMaxResult] {
  153. // `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  154. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  155. const localStorageUsedMaxQuery = `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  156. cfg := pds.promConfig
  157. durStr := timeutil.DurationString(end.Sub(start))
  158. if durStr == "" {
  159. panic("failed to parse duration string passed to QueryLocalStorageUsedMax")
  160. }
  161. queryLocalStorageUsedMax := fmt.Sprintf(localStorageUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  162. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  163. return source.NewFuture(source.DecodeLocalStorageUsedMaxResult, ctx.QueryAtTime(queryLocalStorageUsedMax, end))
  164. }
  165. func (pds *PrometheusMetricsQuerier) QueryLocalStorageBytes(start, end time.Time) *source.Future[source.LocalStorageBytesResult] {
  166. // `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  167. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  168. const localStorageBytesQuery = `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  169. cfg := pds.promConfig
  170. minsPerResolution := cfg.DataResolutionMinutes
  171. durStr := pds.durationStringFor(start, end, minsPerResolution)
  172. if durStr == "" {
  173. panic("failed to parse duration string passed to QueryLocalStorageBytes")
  174. }
  175. queryLocalStorageBytes := fmt.Sprintf(localStorageBytesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  176. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  177. return source.NewFuture(source.DecodeLocalStorageBytesResult, ctx.QueryAtTime(queryLocalStorageBytes, end))
  178. }
  179. func (pds *PrometheusMetricsQuerier) QueryLocalStorageActiveMinutes(start, end time.Time) *source.Future[source.LocalStorageActiveMinutesResult] {
  180. // `count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`
  181. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  182. const localStorageActiveMinutesQuery = `count(node_total_hourly_cost{%s}) by (%s, node, instance, provider_id)[%s:%dm]`
  183. cfg := pds.promConfig
  184. minsPerResolution := cfg.DataResolutionMinutes
  185. durStr := pds.durationStringFor(start, end, minsPerResolution)
  186. if durStr == "" {
  187. panic("failed to parse duration string passed to QueryLocalStorageActiveMinutes")
  188. }
  189. queryLocalStorageActiveMins := fmt.Sprintf(localStorageActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  190. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  191. return source.NewFuture(source.DecodeLocalStorageActiveMinutesResult, ctx.QueryAtTime(queryLocalStorageActiveMins, end))
  192. }
  193. func (pds *PrometheusMetricsQuerier) QueryNodeCPUCoresCapacity(start, end time.Time) *source.Future[source.NodeCPUCoresCapacityResult] {
  194. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  195. const nodeCPUCoresCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (%s, node)`
  196. cfg := pds.promConfig
  197. durStr := timeutil.DurationString(end.Sub(start))
  198. if durStr == "" {
  199. panic("failed to parse duration string passed to QueryNodeCPUCoresCapacity")
  200. }
  201. queryNodeCPUCoresCapacity := fmt.Sprintf(nodeCPUCoresCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  202. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  203. return source.NewFuture(source.DecodeNodeCPUCoresCapacityResult, ctx.QueryAtTime(queryNodeCPUCoresCapacity, end))
  204. }
  205. func (pds *PrometheusMetricsQuerier) QueryNodeCPUCoresAllocatable(start, end time.Time) *source.Future[source.NodeCPUCoresAllocatableResult] {
  206. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  207. const nodeCPUCoresAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_cpu_cores{%s}[%s])) by (%s, node)`
  208. // `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  209. cfg := pds.promConfig
  210. durStr := timeutil.DurationString(end.Sub(start))
  211. if durStr == "" {
  212. panic("failed to parse duration string passed to QueryNodeCPUCoresAllocatable")
  213. }
  214. queryNodeCPUCoresAllocatable := fmt.Sprintf(nodeCPUCoresAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  215. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  216. return source.NewFuture(source.DecodeNodeCPUCoresAllocatableResult, ctx.QueryAtTime(queryNodeCPUCoresAllocatable, end))
  217. }
  218. func (pds *PrometheusMetricsQuerier) QueryNodeRAMBytesCapacity(start, end time.Time) *source.Future[source.NodeRAMBytesCapacityResult] {
  219. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  220. const nodeRAMBytesCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (%s, node)`
  221. cfg := pds.promConfig
  222. durStr := timeutil.DurationString(end.Sub(start))
  223. if durStr == "" {
  224. panic("failed to parse duration string passed to QueryNodeRAMBytesCapacity")
  225. }
  226. queryNodeRAMBytesCapacity := fmt.Sprintf(nodeRAMBytesCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  227. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  228. return source.NewFuture(source.DecodeNodeRAMBytesCapacityResult, ctx.QueryAtTime(queryNodeRAMBytesCapacity, end))
  229. }
  230. func (pds *PrometheusMetricsQuerier) QueryNodeRAMBytesAllocatable(start, end time.Time) *source.Future[source.NodeRAMBytesAllocatableResult] {
  231. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  232. const nodeRAMBytesAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_memory_bytes{%s}[%s])) by (%s, node)`
  233. cfg := pds.promConfig
  234. durStr := timeutil.DurationString(end.Sub(start))
  235. if durStr == "" {
  236. panic("failed to parse duration string passed to QueryNodeRAMBytesAllocatable")
  237. }
  238. queryNodeRAMBytesAllocatable := fmt.Sprintf(nodeRAMBytesAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  239. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  240. return source.NewFuture(source.DecodeNodeRAMBytesAllocatableResult, ctx.QueryAtTime(queryNodeRAMBytesAllocatable, end))
  241. }
  242. func (pds *PrometheusMetricsQuerier) QueryNodeGPUCount(start, end time.Time) *source.Future[source.NodeGPUCountResult] {
  243. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  244. const nodeGPUCountQuery = `avg(avg_over_time(node_gpu_count{%s}[%s])) by (%s, node, provider_id)`
  245. cfg := pds.promConfig
  246. durStr := timeutil.DurationString(end.Sub(start))
  247. if durStr == "" {
  248. panic("failed to parse duration string passed to QueryNodeGPUCount")
  249. }
  250. queryNodeGPUCount := fmt.Sprintf(nodeGPUCountQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  251. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  252. return source.NewFuture(source.DecodeNodeGPUCountResult, ctx.QueryAtTime(queryNodeGPUCount, end))
  253. }
  254. func (pds *PrometheusMetricsQuerier) QueryNodeLabels(start, end time.Time) *source.Future[source.NodeLabelsResult] {
  255. const labelsQuery = `avg_over_time(kube_node_labels{%s}[%s])`
  256. // env.GetPromClusterFilter(), durStr
  257. cfg := pds.promConfig
  258. durStr := timeutil.DurationString(end.Sub(start))
  259. if durStr == "" {
  260. panic("failed to parse duration string passed to QueryNodeLabels")
  261. }
  262. queryLabels := fmt.Sprintf(labelsQuery, cfg.ClusterFilter, durStr)
  263. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  264. return source.NewFuture(source.DecodeNodeLabelsResult, ctx.QueryAtTime(queryLabels, end))
  265. }
  266. func (pds *PrometheusMetricsQuerier) QueryNodeActiveMinutes(start, end time.Time) *source.Future[source.NodeActiveMinutesResult] {
  267. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  268. const activeMinsQuery = `avg(node_total_hourly_cost{%s}) by (node, %s, provider_id)[%s:%dm]`
  269. cfg := pds.promConfig
  270. minsPerResolution := cfg.DataResolutionMinutes
  271. durStr := pds.durationStringFor(start, end, minsPerResolution)
  272. if durStr == "" {
  273. panic("failed to parse duration string passed to QueryNodeActiveMinutes")
  274. }
  275. queryActiveMins := fmt.Sprintf(activeMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  276. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  277. return source.NewFuture(source.DecodeNodeActiveMinutesResult, ctx.QueryAtTime(queryActiveMins, end))
  278. }
  279. func (pds *PrometheusMetricsQuerier) QueryNodeCPUModeTotal(start, end time.Time) *source.Future[source.NodeCPUModeTotalResult] {
  280. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel())
  281. const nodeCPUModeTotalQuery = `sum(rate(node_cpu_seconds_total{%s}[%s:%dm])) by (kubernetes_node, %s, mode)`
  282. cfg := pds.promConfig
  283. minsPerResolution := cfg.DataResolutionMinutes
  284. durStr := pds.durationStringFor(start, end, minsPerResolution)
  285. if durStr == "" {
  286. panic("failed to parse duration string passed to QueryNodeCPUModeTotal")
  287. }
  288. queryCPUModeTotal := fmt.Sprintf(nodeCPUModeTotalQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  289. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  290. return source.NewFuture(source.DecodeNodeCPUModeTotalResult, ctx.QueryAtTime(queryCPUModeTotal, end))
  291. }
  292. func (pds *PrometheusMetricsQuerier) QueryNodeRAMSystemPercent(start, end time.Time) *source.Future[source.NodeRAMSystemPercentResult] {
  293. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  294. const nodeRAMSystemPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  295. cfg := pds.promConfig
  296. minsPerResolution := cfg.DataResolutionMinutes
  297. durStr := pds.durationStringFor(start, end, minsPerResolution)
  298. if durStr == "" {
  299. panic("failed to parse duration string passed to QueryNodeRAMSystemPercent")
  300. }
  301. queryRAMSystemPct := fmt.Sprintf(nodeRAMSystemPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  302. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  303. return source.NewFuture(source.DecodeNodeRAMSystemPercentResult, ctx.QueryAtTime(queryRAMSystemPct, end))
  304. }
  305. func (pds *PrometheusMetricsQuerier) QueryNodeRAMUserPercent(start, end time.Time) *source.Future[source.NodeRAMUserPercentResult] {
  306. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  307. const nodeRAMUserPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  308. cfg := pds.promConfig
  309. minsPerResolution := cfg.DataResolutionMinutes
  310. durStr := pds.durationStringFor(start, end, minsPerResolution)
  311. if durStr == "" {
  312. panic("failed to parse duration string passed to QueryNodeRAMUserPercent")
  313. }
  314. queryRAMUserPct := fmt.Sprintf(nodeRAMUserPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  315. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  316. return source.NewFuture(source.DecodeNodeRAMUserPercentResult, ctx.QueryAtTime(queryRAMUserPct, end))
  317. }
  318. func (pds *PrometheusMetricsQuerier) QueryLBPricePerHr(start, end time.Time) *source.Future[source.LBPricePerHrResult] {
  319. const queryFmtLBCostPerHr = `avg(avg_over_time(kubecost_load_balancer_cost{%s}[%s])) by (namespace, service_name, ingress_ip, %s)`
  320. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  321. cfg := pds.promConfig
  322. durStr := timeutil.DurationString(end.Sub(start))
  323. if durStr == "" {
  324. panic("failed to parse duration string passed to QueryLBPricePerHr")
  325. }
  326. queryLBCostPerHr := fmt.Sprintf(queryFmtLBCostPerHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  327. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  328. return source.NewFuture(source.DecodeLBPricePerHrResult, ctx.QueryAtTime(queryLBCostPerHr, end))
  329. }
  330. func (pds *PrometheusMetricsQuerier) QueryLBActiveMinutes(start, end time.Time) *source.Future[source.LBActiveMinutesResult] {
  331. const lbActiveMinutesQuery = `avg(kubecost_load_balancer_cost{%s}) by (namespace, service_name, %s, ingress_ip)[%s:%dm]`
  332. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  333. cfg := pds.promConfig
  334. minsPerResolution := cfg.DataResolutionMinutes
  335. durStr := pds.durationStringFor(start, end, minsPerResolution)
  336. if durStr == "" {
  337. panic("failed to parse duration string passed to QueryLBActiveMinutes")
  338. }
  339. queryLBActiveMins := fmt.Sprintf(lbActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  340. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  341. return source.NewFuture(source.DecodeLBActiveMinutesResult, ctx.QueryAtTime(queryLBActiveMins, end))
  342. }
  343. func (pds *PrometheusMetricsQuerier) QueryClusterManagementDuration(start, end time.Time) *source.Future[source.ClusterManagementDurationResult] {
  344. const clusterManagementDurationQuery = `avg(kubecost_cluster_management_cost{%s}) by (%s, provisioner_name)[%s:%dm]`
  345. cfg := pds.promConfig
  346. minsPerResolution := cfg.DataResolutionMinutes
  347. durStr := pds.durationStringFor(start, end, minsPerResolution)
  348. if durStr == "" {
  349. panic("failed to parse duration string passed to QueryClusterManagementDuration")
  350. }
  351. queryClusterManagementDuration := fmt.Sprintf(clusterManagementDurationQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  352. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  353. return source.NewFuture(source.DecodeClusterManagementDurationResult, ctx.QueryAtTime(queryClusterManagementDuration, end))
  354. }
  355. func (pds *PrometheusMetricsQuerier) QueryClusterManagementPricePerHr(start, end time.Time) *source.Future[source.ClusterManagementPricePerHrResult] {
  356. const clusterManagementCostQuery = `avg(avg_over_time(kubecost_cluster_management_cost{%s}[%s])) by (%s, provisioner_name)`
  357. // env.GetPromClusterFilter(), durationStr, env.GetPromClusterLabel()
  358. cfg := pds.promConfig
  359. durStr := timeutil.DurationString(end.Sub(start))
  360. if durStr == "" {
  361. panic("failed to parse duration string passed to QueryClusterManagementCost")
  362. }
  363. queryClusterManagementCost := fmt.Sprintf(clusterManagementCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  364. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  365. return source.NewFuture(source.DecodeClusterManagementPricePerHrResult, ctx.QueryAtTime(queryClusterManagementCost, end))
  366. }
  367. // AllocationMetricQuerier
  368. func (pds *PrometheusMetricsQuerier) QueryPods(start, end time.Time) *source.Future[source.PodsResult] {
  369. const queryFmtPods = `avg(kube_pod_container_status_running{%s} != 0) by (pod, namespace, %s)[%s:%s]`
  370. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  371. cfg := pds.promConfig
  372. resolution := cfg.DataResolution
  373. resStr := timeutil.DurationString(resolution)
  374. durStr := timeutil.DurationString(end.Sub(start))
  375. if durStr == "" {
  376. panic("failed to parse duration string passed to QueryPods")
  377. }
  378. queryPods := fmt.Sprintf(queryFmtPods, cfg.ClusterFilter, cfg.ClusterLabel, durStr, resStr)
  379. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  380. return source.NewFuture(source.DecodePodsResult, ctx.QueryAtTime(queryPods, end))
  381. }
  382. func (pds *PrometheusMetricsQuerier) QueryPodsUID(start, end time.Time) *source.Future[source.PodsResult] {
  383. const queryFmtPodsUID = `avg(kube_pod_container_status_running{%s} != 0) by (pod, namespace, uid, %s)[%s:%s]`
  384. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  385. cfg := pds.promConfig
  386. resolution := cfg.DataResolution
  387. resStr := timeutil.DurationString(resolution)
  388. durStr := timeutil.DurationString(end.Sub(start))
  389. if durStr == "" {
  390. panic("failed to parse duration string passed to QueryPodsUID")
  391. }
  392. queryPodsUID := fmt.Sprintf(queryFmtPodsUID, cfg.ClusterFilter, cfg.ClusterLabel, durStr, resStr)
  393. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  394. return source.NewFuture(source.DecodePodsResult, ctx.QueryAtTime(queryPodsUID, end))
  395. }
  396. func (pds *PrometheusMetricsQuerier) QueryRAMBytesAllocated(start, end time.Time) *source.Future[source.RAMBytesAllocatedResult] {
  397. const queryFmtRAMBytesAllocated = `avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s, provider_id)`
  398. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  399. cfg := pds.promConfig
  400. durStr := timeutil.DurationString(end.Sub(start))
  401. if durStr == "" {
  402. panic("failed to parse duration string passed to QueryRAMBytesAllocated")
  403. }
  404. queryRAMBytesAllocated := fmt.Sprintf(queryFmtRAMBytesAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  405. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  406. return source.NewFuture(source.DecodeRAMBytesAllocatedResult, ctx.QueryAtTime(queryRAMBytesAllocated, end))
  407. }
  408. func (pds *PrometheusMetricsQuerier) QueryRAMRequests(start, end time.Time) *source.Future[source.RAMRequestsResult] {
  409. const queryFmtRAMRequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  410. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  411. cfg := pds.promConfig
  412. durStr := timeutil.DurationString(end.Sub(start))
  413. if durStr == "" {
  414. panic("failed to parse duration string passed to QueryRAMRequests")
  415. }
  416. queryRAMRequests := fmt.Sprintf(queryFmtRAMRequests, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  417. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  418. return source.NewFuture(source.DecodeRAMRequestsResult, ctx.QueryAtTime(queryRAMRequests, end))
  419. }
  420. func (pds *PrometheusMetricsQuerier) QueryRAMUsageAvg(start, end time.Time) *source.Future[source.RAMUsageAvgResult] {
  421. const queryFmtRAMUsageAvg = `avg(avg_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
  422. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  423. cfg := pds.promConfig
  424. durStr := timeutil.DurationString(end.Sub(start))
  425. if durStr == "" {
  426. panic("failed to parse duration string passed to QueryRAMUsageAvg")
  427. }
  428. queryRAMUsageAvg := fmt.Sprintf(queryFmtRAMUsageAvg, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  429. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  430. return source.NewFuture(source.DecodeRAMUsageAvgResult, ctx.QueryAtTime(queryRAMUsageAvg, end))
  431. }
  432. func (pds *PrometheusMetricsQuerier) QueryRAMUsageMax(start, end time.Time) *source.Future[source.RAMUsageMaxResult] {
  433. const queryFmtRAMUsageMax = `max(max_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
  434. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  435. cfg := pds.promConfig
  436. durStr := timeutil.DurationString(end.Sub(start))
  437. if durStr == "" {
  438. panic("failed to parse duration string passed to QueryRAMUsageMax")
  439. }
  440. queryRAMUsageMax := fmt.Sprintf(queryFmtRAMUsageMax, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  441. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  442. return source.NewFuture(source.DecodeRAMUsageMaxResult, ctx.QueryAtTime(queryRAMUsageMax, end))
  443. }
  444. func (pds *PrometheusMetricsQuerier) QueryCPUCoresAllocated(start, end time.Time) *source.Future[source.CPUCoresAllocatedResult] {
  445. const queryFmtCPUCoresAllocated = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  446. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  447. cfg := pds.promConfig
  448. durStr := timeutil.DurationString(end.Sub(start))
  449. if durStr == "" {
  450. panic("failed to parse duration string passed to QueryCPUCoresAllocated")
  451. }
  452. queryCPUCoresAllocated := fmt.Sprintf(queryFmtCPUCoresAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  453. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  454. return source.NewFuture(source.DecodeCPUCoresAllocatedResult, ctx.QueryAtTime(queryCPUCoresAllocated, end))
  455. }
  456. func (pds *PrometheusMetricsQuerier) QueryCPURequests(start, end time.Time) *source.Future[source.CPURequestsResult] {
  457. const queryFmtCPURequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  458. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  459. cfg := pds.promConfig
  460. durStr := timeutil.DurationString(end.Sub(start))
  461. if durStr == "" {
  462. panic("failed to parse duration string passed to QueryCPURequests")
  463. }
  464. queryCPURequests := fmt.Sprintf(queryFmtCPURequests, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  465. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  466. return source.NewFuture(source.DecodeCPURequestsResult, ctx.QueryAtTime(queryCPURequests, end))
  467. }
  468. func (pds *PrometheusMetricsQuerier) QueryCPUUsageAvg(start, end time.Time) *source.Future[source.CPUUsageAvgResult] {
  469. const queryFmtCPUUsageAvg = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
  470. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  471. cfg := pds.promConfig
  472. durStr := timeutil.DurationString(end.Sub(start))
  473. if durStr == "" {
  474. panic("failed to parse duration string passed to QueryCPUUsageAvg")
  475. }
  476. queryCPUUsageAvg := fmt.Sprintf(queryFmtCPUUsageAvg, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  477. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  478. return source.NewFuture(source.DecodeCPUUsageAvgResult, ctx.QueryAtTime(queryCPUUsageAvg, end))
  479. }
// QueryCPUUsageMax returns the maximum instant-by-instant CPU usage (cores)
// per container over [start, end). It first tries a cheap recording-rule
// metric and, if that metric has no data, falls back to an equivalent but
// more expensive subquery.
func (pds *PrometheusMetricsQuerier) QueryCPUUsageMax(start, end time.Time) *source.Future[source.CPUUsageMaxResult] {
	// Because we use container_cpu_usage_seconds_total to calculate CPU usage
	// at any given "instant" of time, we need to use an irate or rate. To then
	// calculate a max (or any aggregation) we have to perform an aggregation
	// query on top of an instant-by-instant maximum. Prometheus supports this
	// type of query with a "subquery" [1], however it is reportedly expensive
	// to make such a query. By default, Kubecost's Prometheus config includes
	// a recording rule that keeps track of the instant-by-instant irate for CPU
	// usage. The metric in this query is created by that recording rule.
	//
	// [1] https://prometheus.io/blog/2019/01/28/subquery-support/
	//
	// If changing the name of the recording rule, make sure to update the
	// corresponding diagnostic query to avoid confusion.
	const queryFmtCPUUsageMaxRecordingRule = `max(max_over_time(kubecost_container_cpu_usage_irate{%s}[%s])) by (container_name, container, pod_name, pod, namespace, node, instance, %s)`
	// env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
	// This is the subquery equivalent of the above recording rule query. It is
	// more expensive, but does not require the recording rule. It should be
	// used as a fallback query if the recording rule data does not exist.
	//
	// The parameter after the colon [:<thisone>] in the subquery affects the
	// resolution of the subquery.
	// The parameter after the metric ...{}[<thisone>] should be set to 2x
	// the resolution, to make sure the irate always has two points to query
	// in case the Prom scrape duration has been reduced to be equal to the
	// ETL resolution.
	const queryFmtCPUUsageMaxSubquery = `max(max_over_time(irate(container_cpu_usage_seconds_total{container!="POD", container!="", %s}[%s])[%s:%s])) by (container, pod_name, pod, namespace, node, instance, %s)`
	// env.GetPromClusterFilter(), doubleResStr, durStr, resStr, env.GetPromClusterLabel()
	cfg := pds.promConfig
	durStr := timeutil.DurationString(end.Sub(start))
	if durStr == "" {
		panic("failed to parse duration string passed to QueryCPUUsageMax")
	}
	queryCPUUsageMaxRecordingRule := fmt.Sprintf(queryFmtCPUUsageMaxRecordingRule, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
	ctx := pds.promContexts.NewNamedContext(AllocationContextName)
	// Unlike the sibling Query* methods, this one blocks on the recording-rule
	// query so it can decide whether the subquery fallback is needed. The
	// query error is discarded: an error and an empty result are treated the
	// same, both routing to the fallback below.
	// NOTE(review): a transient recording-rule query failure therefore
	// silently triggers the expensive subquery — confirm this is intended.
	resCPUUsageMaxRR := ctx.QueryAtTime(queryCPUUsageMaxRecordingRule, end)
	resCPUUsageMax, _ := resCPUUsageMaxRR.Await()
	if len(resCPUUsageMax) > 0 {
		// Recording-rule data exists; wrap the already-fetched results in a
		// completed Future rather than re-querying.
		return wrapResults(queryCPUUsageMaxRecordingRule, source.DecodeCPUUsageMaxResult, resCPUUsageMax)
	}
	// Fallback: run the subquery at the ETL resolution, with a 2x-resolution
	// irate window (see the comment above queryFmtCPUUsageMaxSubquery).
	resolution := cfg.DataResolution
	resStr := timeutil.DurationString(resolution)
	doubleResStr := timeutil.DurationString(2 * resolution)
	queryCPUUsageMaxSubquery := fmt.Sprintf(queryFmtCPUUsageMaxSubquery, cfg.ClusterFilter, doubleResStr, durStr, resStr, cfg.ClusterLabel)
	return source.NewFuture(source.DecodeCPUUsageMaxResult, ctx.QueryAtTime(queryCPUUsageMaxSubquery, end))
}
  526. func (pds *PrometheusMetricsQuerier) QueryGPUsRequested(start, end time.Time) *source.Future[source.GPUsRequestedResult] {
  527. const queryFmtGPUsRequested = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  528. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  529. cfg := pds.promConfig
  530. durStr := timeutil.DurationString(end.Sub(start))
  531. if durStr == "" {
  532. panic("failed to parse duration string passed to QueryGPUsRequested")
  533. }
  534. queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  535. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  536. return source.NewFuture(source.DecodeGPUsRequestedResult, ctx.QueryAtTime(queryGPUsRequested, end))
  537. }
  538. func (pds *PrometheusMetricsQuerier) QueryGPUsUsageAvg(start, end time.Time) *source.Future[source.GPUsUsageAvgResult] {
  539. const queryFmtGPUsUsageAvg = `avg(avg_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
  540. // durStr, env.GetPromClusterLabel()
  541. cfg := pds.promConfig
  542. durStr := timeutil.DurationString(end.Sub(start))
  543. if durStr == "" {
  544. panic("failed to parse duration string passed to QueryGPUsUsageAvg")
  545. }
  546. queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, cfg.ClusterLabel)
  547. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  548. return source.NewFuture(source.DecodeGPUsUsageAvgResult, ctx.QueryAtTime(queryGPUsUsageAvg, end))
  549. }
  550. func (pds *PrometheusMetricsQuerier) QueryGPUsUsageMax(start, end time.Time) *source.Future[source.GPUsUsageMaxResult] {
  551. const queryFmtGPUsUsageMax = `max(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
  552. // durStr, env.GetPromClusterLabel()
  553. cfg := pds.promConfig
  554. durStr := timeutil.DurationString(end.Sub(start))
  555. if durStr == "" {
  556. panic("failed to parse duration string passed to QueryGPUsUsageMax")
  557. }
  558. queryGPUsUsageMax := fmt.Sprintf(queryFmtGPUsUsageMax, durStr, cfg.ClusterLabel)
  559. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  560. return source.NewFuture(source.DecodeGPUsUsageMaxResult, ctx.QueryAtTime(queryGPUsUsageMax, end))
  561. }
  562. func (pds *PrometheusMetricsQuerier) QueryGPUsAllocated(start, end time.Time) *source.Future[source.GPUsAllocatedResult] {
  563. const queryFmtGPUsAllocated = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  564. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  565. cfg := pds.promConfig
  566. durStr := timeutil.DurationString(end.Sub(start))
  567. if durStr == "" {
  568. panic("failed to parse duration string passed to QueryGPUsAllocated")
  569. }
  570. queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  571. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  572. return source.NewFuture(source.DecodeGPUsAllocatedResult, ctx.QueryAtTime(queryGPUsAllocated, end))
  573. }
  574. func (pds *PrometheusMetricsQuerier) QueryIsGPUShared(start, end time.Time) *source.Future[source.IsGPUSharedResult] {
  575. const queryFmtIsGPUShared = `avg(avg_over_time(kube_pod_container_resource_requests{container!="", node != "", pod != "", container!= "", unit = "integer", %s}[%s])) by (container, pod, namespace, node, resource, %s)`
  576. // env.GetPromClusterFilter(), durStr
  577. cfg := pds.promConfig
  578. durStr := timeutil.DurationString(end.Sub(start))
  579. if durStr == "" {
  580. panic("failed to parse duration string passed to QueryIsGPUShared")
  581. }
  582. queryIsGPUShared := fmt.Sprintf(queryFmtIsGPUShared, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  583. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  584. return source.NewFuture(source.DecodeIsGPUSharedResult, ctx.QueryAtTime(queryIsGPUShared, end))
  585. }
  586. func (pds *PrometheusMetricsQuerier) QueryGPUInfo(start, end time.Time) *source.Future[source.GPUInfoResult] {
  587. const queryFmtGetGPUInfo = `avg(avg_over_time(DCGM_FI_DEV_DEC_UTIL{container!="",%s}[%s])) by (container, pod, namespace, device, modelName, UUID, %s)`
  588. // env.GetPromClusterFilter(), durStr
  589. cfg := pds.promConfig
  590. durStr := timeutil.DurationString(end.Sub(start))
  591. if durStr == "" {
  592. panic("failed to parse duration string passed to QueryGPUInfo")
  593. }
  594. queryGetGPUInfo := fmt.Sprintf(queryFmtGetGPUInfo, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  595. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  596. return source.NewFuture(source.DecodeGPUInfoResult, ctx.QueryAtTime(queryGetGPUInfo, end))
  597. }
  598. func (pds *PrometheusMetricsQuerier) QueryNodeCPUPricePerHr(start, end time.Time) *source.Future[source.NodeCPUPricePerHrResult] {
  599. const queryFmtNodeCostPerCPUHr = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  600. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  601. cfg := pds.promConfig
  602. durStr := timeutil.DurationString(end.Sub(start))
  603. if durStr == "" {
  604. panic("failed to parse duration string passed to QueryNodeCPUPricePerHr")
  605. }
  606. queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  607. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  608. return source.NewFuture(source.DecodeNodeCPUPricePerHrResult, ctx.QueryAtTime(queryNodeCostPerCPUHr, end))
  609. }
  610. func (pds *PrometheusMetricsQuerier) QueryNodeRAMPricePerGiBHr(start, end time.Time) *source.Future[source.NodeRAMPricePerGiBHrResult] {
  611. const queryFmtNodeCostPerRAMGiBHr = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  612. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  613. cfg := pds.promConfig
  614. durStr := timeutil.DurationString(end.Sub(start))
  615. if durStr == "" {
  616. panic("failed to parse duration string passed to QueryNodeRAMPricePerGiBHr")
  617. }
  618. queryNodeCostPerRAMGiBHr := fmt.Sprintf(queryFmtNodeCostPerRAMGiBHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  619. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  620. return source.NewFuture(source.DecodeNodeRAMPricePerGiBHrResult, ctx.QueryAtTime(queryNodeCostPerRAMGiBHr, end))
  621. }
  622. func (pds *PrometheusMetricsQuerier) QueryNodeGPUPricePerHr(start, end time.Time) *source.Future[source.NodeGPUPricePerHrResult] {
  623. const queryFmtNodeCostPerGPUHr = `avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  624. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  625. cfg := pds.promConfig
  626. durStr := timeutil.DurationString(end.Sub(start))
  627. if durStr == "" {
  628. panic("failed to parse duration string passed to QueryNodeGPUPricePerHr")
  629. }
  630. queryNodeCostPerGPUHr := fmt.Sprintf(queryFmtNodeCostPerGPUHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  631. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  632. return source.NewFuture(source.DecodeNodeGPUPricePerHrResult, ctx.QueryAtTime(queryNodeCostPerGPUHr, end))
  633. }
  634. func (pds *PrometheusMetricsQuerier) QueryNodeIsSpot(start, end time.Time) *source.Future[source.NodeIsSpotResult] {
  635. const queryFmtNodeIsSpot = `avg_over_time(kubecost_node_is_spot{%s}[%s])`
  636. //`avg_over_time(kubecost_node_is_spot{%s}[%s:%dm])`
  637. // env.GetPromClusterFilter(), durStr)
  638. cfg := pds.promConfig
  639. durStr := timeutil.DurationString(end.Sub(start))
  640. if durStr == "" {
  641. panic("failed to parse duration string passed to QueryNodeIsSpot2")
  642. }
  643. queryNodeIsSpot := fmt.Sprintf(queryFmtNodeIsSpot, cfg.ClusterFilter, durStr)
  644. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  645. return source.NewFuture(source.DecodeNodeIsSpotResult, ctx.QueryAtTime(queryNodeIsSpot, end))
  646. }
  647. func (pds *PrometheusMetricsQuerier) QueryPodPVCAllocation(start, end time.Time) *source.Future[source.PodPVCAllocationResult] {
  648. const queryFmtPodPVCAllocation = `avg(avg_over_time(pod_pvc_allocation{%s}[%s])) by (persistentvolume, persistentvolumeclaim, pod, namespace, %s)`
  649. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  650. cfg := pds.promConfig
  651. durStr := timeutil.DurationString(end.Sub(start))
  652. if durStr == "" {
  653. panic("failed to parse duration string passed to QueryPodPVCAllocation")
  654. }
  655. queryPodPVCAllocation := fmt.Sprintf(queryFmtPodPVCAllocation, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  656. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  657. return source.NewFuture(source.DecodePodPVCAllocationResult, ctx.QueryAtTime(queryPodPVCAllocation, end))
  658. }
  659. func (pds *PrometheusMetricsQuerier) QueryPVCBytesRequested(start, end time.Time) *source.Future[source.PVCBytesRequestedResult] {
  660. const queryFmtPVCBytesRequested = `avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{%s}[%s])) by (persistentvolumeclaim, namespace, %s)`
  661. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  662. cfg := pds.promConfig
  663. durStr := timeutil.DurationString(end.Sub(start))
  664. if durStr == "" {
  665. panic("failed to parse duration string passed to QueryPVCBytesRequested")
  666. }
  667. queryPVCBytesRequested := fmt.Sprintf(queryFmtPVCBytesRequested, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  668. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  669. return source.NewFuture(source.DecodePVCBytesRequestedResult, ctx.QueryAtTime(queryPVCBytesRequested, end))
  670. }
  671. func (pds *PrometheusMetricsQuerier) QueryPVBytes(start, end time.Time) *source.Future[source.PVBytesResult] {
  672. const queryFmtPVBytes = `avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (persistentvolume, %s)`
  673. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  674. cfg := pds.promConfig
  675. durStr := timeutil.DurationString(end.Sub(start))
  676. if durStr == "" {
  677. panic("failed to parse duration string passed to QueryPVBytes")
  678. }
  679. queryPVBytes := fmt.Sprintf(queryFmtPVBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  680. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  681. return source.NewFuture(source.DecodePVBytesResult, ctx.QueryAtTime(queryPVBytes, end))
  682. }
  683. func (pds *PrometheusMetricsQuerier) QueryPVInfo(start, end time.Time) *source.Future[source.PVInfoResult] {
  684. const queryFmtPVMeta = `avg(avg_over_time(kubecost_pv_info{%s}[%s])) by (%s, storageclass, persistentvolume, provider_id)`
  685. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  686. cfg := pds.promConfig
  687. durStr := timeutil.DurationString(end.Sub(start))
  688. if durStr == "" {
  689. panic("failed to parse duration string passed to QueryPVMeta")
  690. }
  691. queryPVMeta := fmt.Sprintf(queryFmtPVMeta, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  692. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  693. return source.NewFuture(source.DecodePVInfoResult, ctx.QueryAtTime(queryPVMeta, end))
  694. }
  695. func (pds *PrometheusMetricsQuerier) QueryNetZoneGiB(start, end time.Time) *source.Future[source.NetZoneGiBResult] {
  696. const queryFmtNetZoneGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", same_zone="false", same_region="true", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  697. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  698. cfg := pds.promConfig
  699. durStr := timeutil.DurationString(end.Sub(start))
  700. if durStr == "" {
  701. panic("failed to parse duration string passed to QueryNetZoneGiB")
  702. }
  703. queryNetZoneGiB := fmt.Sprintf(queryFmtNetZoneGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  704. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  705. return source.NewFuture(source.DecodeNetZoneGiBResult, ctx.QueryAtTime(queryNetZoneGiB, end))
  706. }
  707. func (pds *PrometheusMetricsQuerier) QueryNetZonePricePerGiB(start, end time.Time) *source.Future[source.NetZonePricePerGiBResult] {
  708. const queryFmtNetZoneCostPerGiB = `avg(avg_over_time(kubecost_network_zone_egress_cost{%s}[%s])) by (%s)`
  709. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  710. cfg := pds.promConfig
  711. durStr := timeutil.DurationString(end.Sub(start))
  712. if durStr == "" {
  713. panic("failed to parse duration string passed to QueryNetZonePricePerGiB")
  714. }
  715. queryNetZoneCostPerGiB := fmt.Sprintf(queryFmtNetZoneCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  716. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  717. return source.NewFuture(source.DecodeNetZonePricePerGiBResult, ctx.QueryAtTime(queryNetZoneCostPerGiB, end))
  718. }
  719. func (pds *PrometheusMetricsQuerier) QueryNetRegionGiB(start, end time.Time) *source.Future[source.NetRegionGiBResult] {
  720. const queryFmtNetRegionGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", same_zone="false", same_region="false", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  721. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  722. cfg := pds.promConfig
  723. durStr := timeutil.DurationString(end.Sub(start))
  724. if durStr == "" {
  725. panic("failed to parse duration string passed to QueryNetRegionGiB")
  726. }
  727. queryNetRegionGiB := fmt.Sprintf(queryFmtNetRegionGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  728. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  729. return source.NewFuture(source.DecodeNetRegionGiBResult, ctx.QueryAtTime(queryNetRegionGiB, end))
  730. }
  731. func (pds *PrometheusMetricsQuerier) QueryNetRegionPricePerGiB(start, end time.Time) *source.Future[source.NetRegionPricePerGiBResult] {
  732. const queryFmtNetRegionCostPerGiB = `avg(avg_over_time(kubecost_network_region_egress_cost{%s}[%s])) by (%s)`
  733. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  734. cfg := pds.promConfig
  735. durStr := timeutil.DurationString(end.Sub(start))
  736. if durStr == "" {
  737. panic("failed to parse duration string passed to QueryNetRegionPricePerGiB")
  738. }
  739. queryNetRegionCostPerGiB := fmt.Sprintf(queryFmtNetRegionCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  740. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  741. return source.NewFuture(source.DecodeNetRegionPricePerGiBResult, ctx.QueryAtTime(queryNetRegionCostPerGiB, end))
  742. }
  743. func (pds *PrometheusMetricsQuerier) QueryNetInternetGiB(start, end time.Time) *source.Future[source.NetInternetGiBResult] {
  744. const queryFmtNetInternetGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  745. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  746. cfg := pds.promConfig
  747. durStr := timeutil.DurationString(end.Sub(start))
  748. if durStr == "" {
  749. panic("failed to parse duration string passed to QueryNetInternetGiB")
  750. }
  751. queryNetInternetGiB := fmt.Sprintf(queryFmtNetInternetGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  752. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  753. return source.NewFuture(source.DecodeNetInternetGiBResult, ctx.QueryAtTime(queryNetInternetGiB, end))
  754. }
  755. func (pds *PrometheusMetricsQuerier) QueryNetInternetPricePerGiB(start, end time.Time) *source.Future[source.NetInternetPricePerGiBResult] {
  756. const queryFmtNetInternetCostPerGiB = `avg(avg_over_time(kubecost_network_internet_egress_cost{%s}[%s])) by (%s)`
  757. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  758. cfg := pds.promConfig
  759. durStr := timeutil.DurationString(end.Sub(start))
  760. if durStr == "" {
  761. panic("failed to parse duration string passed to QueryNetInternetPricePerGiB")
  762. }
  763. queryNetInternetCostPerGiB := fmt.Sprintf(queryFmtNetInternetCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  764. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  765. return source.NewFuture(source.DecodeNetInternetPricePerGiBResult, ctx.QueryAtTime(queryNetInternetCostPerGiB, end))
  766. }
  767. func (pds *PrometheusMetricsQuerier) QueryNetInternetServiceGiB(start, end time.Time) *source.Future[source.NetInternetServiceGiBResult] {
  768. const queryFmtNetInternetGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s])) by (pod_name, namespace, service, %s) / 1024 / 1024 / 1024`
  769. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  770. cfg := pds.promConfig
  771. durStr := timeutil.DurationString(end.Sub(start))
  772. if durStr == "" {
  773. panic("failed to parse duration string passed to QueryNetInternetGiB")
  774. }
  775. queryNetInternetGiB := fmt.Sprintf(queryFmtNetInternetGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  776. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  777. return source.NewFuture(source.DecodeNetInternetServiceGiBResult, ctx.QueryAtTime(queryNetInternetGiB, end))
  778. }
  779. func (pds *PrometheusMetricsQuerier) QueryNetTransferBytes(start, end time.Time) *source.Future[source.NetTransferBytesResult] {
  780. const queryFmtNetTransferBytes = `sum(increase(container_network_transmit_bytes_total{pod!="", %s}[%s])) by (pod_name, pod, namespace, %s)`
  781. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  782. cfg := pds.promConfig
  783. durStr := timeutil.DurationString(end.Sub(start))
  784. if durStr == "" {
  785. panic("failed to parse duration string passed to QueryNetTransferBytes")
  786. }
  787. queryNetTransferBytes := fmt.Sprintf(queryFmtNetTransferBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  788. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  789. return source.NewFuture(source.DecodeNetTransferBytesResult, ctx.QueryAtTime(queryNetTransferBytes, end))
  790. }
  791. func (pds *PrometheusMetricsQuerier) QueryNetZoneIngressGiB(start, end time.Time) *source.Future[source.NetZoneIngressGiBResult] {
  792. const queryFmtIngNetZoneGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="false", same_zone="false", same_region="true", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  793. cfg := pds.promConfig
  794. durStr := timeutil.DurationString(end.Sub(start))
  795. if durStr == "" {
  796. panic("failed to parse duration string passed to QueryNetZoneIngressGiB")
  797. }
  798. queryNetZoneCostPerGiB := fmt.Sprintf(queryFmtIngNetZoneGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  799. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  800. return source.NewFuture(source.DecodeNetZoneIngressGiBResult, ctx.QueryAtTime(queryNetZoneCostPerGiB, end))
  801. }
  802. func (pds *PrometheusMetricsQuerier) QueryNetRegionIngressGiB(start, end time.Time) *source.Future[source.NetRegionIngressGiBResult] {
  803. const queryFmtIngNetRegionGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="false", same_zone="false", same_region="false", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  804. cfg := pds.promConfig
  805. durStr := timeutil.DurationString(end.Sub(start))
  806. if durStr == "" {
  807. panic("failed to parse duration string passed to QueryNetRegionIngressGiB")
  808. }
  809. queryNetRegionIngGiB := fmt.Sprintf(queryFmtIngNetRegionGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  810. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  811. return source.NewFuture(source.DecodeNetRegionIngressGiBResult, ctx.QueryAtTime(queryNetRegionIngGiB, end))
  812. }
  813. func (pds *PrometheusMetricsQuerier) QueryNetInternetIngressGiB(start, end time.Time) *source.Future[source.NetInternetIngressGiBResult] {
  814. const queryFmtNetIngInternetGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="true", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  815. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  816. cfg := pds.promConfig
  817. durStr := timeutil.DurationString(end.Sub(start))
  818. if durStr == "" {
  819. panic("failed to parse duration string passed to QueryNetInternetIngressGiB")
  820. }
  821. queryNetIngInternetGiB := fmt.Sprintf(queryFmtNetIngInternetGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  822. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  823. return source.NewFuture(source.DecodeNetInternetIngressGiBResult, ctx.QueryAtTime(queryNetIngInternetGiB, end))
  824. }
  825. func (pds *PrometheusMetricsQuerier) QueryNetInternetServiceIngressGiB(start, end time.Time) *source.Future[source.NetInternetServiceIngressGiBResult] {
  826. const queryFmtIngNetInternetGiB = `sum(increase(kubecost_pod_network_ingress_bytes_total{internet="true", %s}[%s])) by (pod_name, namespace, service, %s) / 1024 / 1024 / 1024`
  827. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  828. cfg := pds.promConfig
  829. durStr := timeutil.DurationString(end.Sub(start))
  830. if durStr == "" {
  831. panic("failed to parse duration string passed to QueryNetInternetServiceIngressGiB")
  832. }
  833. queryNetIngInternetGiB := fmt.Sprintf(queryFmtIngNetInternetGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  834. ctx := pds.promContexts.NewNamedContext(NetworkInsightsContextName)
  835. return source.NewFuture(source.DecodeNetInternetServiceIngressGiBResult, ctx.QueryAtTime(queryNetIngInternetGiB, end))
  836. }
  837. func (pds *PrometheusMetricsQuerier) QueryNetReceiveBytes(start, end time.Time) *source.Future[source.NetReceiveBytesResult] {
  838. const queryFmtNetReceiveBytes = `sum(increase(container_network_receive_bytes_total{pod!="", %s}[%s])) by (pod_name, pod, namespace, %s)`
  839. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  840. cfg := pds.promConfig
  841. durStr := timeutil.DurationString(end.Sub(start))
  842. if durStr == "" {
  843. panic("failed to parse duration string passed to QueryNetReceiveBytes")
  844. }
  845. queryNetReceiveBytes := fmt.Sprintf(queryFmtNetReceiveBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  846. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  847. return source.NewFuture(source.DecodeNetReceiveBytesResult, ctx.QueryAtTime(queryNetReceiveBytes, end))
  848. }
  849. func (pds *PrometheusMetricsQuerier) QueryNamespaceLabels(start, end time.Time) *source.Future[source.NamespaceLabelsResult] {
  850. const queryFmtNamespaceLabels = `avg_over_time(kube_namespace_labels{%s}[%s])`
  851. // env.GetPromClusterFilter(), durStr
  852. cfg := pds.promConfig
  853. durStr := timeutil.DurationString(end.Sub(start))
  854. if durStr == "" {
  855. panic("failed to parse duration string passed to QueryNamespaceLabels")
  856. }
  857. queryNamespaceLabels := fmt.Sprintf(queryFmtNamespaceLabels, cfg.ClusterFilter, durStr)
  858. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  859. return source.NewFuture(source.DecodeNamespaceLabelsResult, ctx.QueryAtTime(queryNamespaceLabels, end))
  860. }
  861. func (pds *PrometheusMetricsQuerier) QueryNamespaceAnnotations(start, end time.Time) *source.Future[source.NamespaceAnnotationsResult] {
  862. const queryFmtNamespaceAnnotations = `avg_over_time(kube_namespace_annotations{%s}[%s])`
  863. // env.GetPromClusterFilter(), durStr
  864. cfg := pds.promConfig
  865. durStr := timeutil.DurationString(end.Sub(start))
  866. if durStr == "" {
  867. panic("failed to parse duration string passed to QueryNamespaceAnnotations")
  868. }
  869. queryNamespaceAnnotations := fmt.Sprintf(queryFmtNamespaceAnnotations, cfg.ClusterFilter, durStr)
  870. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  871. return source.NewFuture(source.DecodeNamespaceAnnotationsResult, ctx.QueryAtTime(queryNamespaceAnnotations, end))
  872. }
  873. func (pds *PrometheusMetricsQuerier) QueryPodLabels(start, end time.Time) *source.Future[source.PodLabelsResult] {
  874. const queryFmtPodLabels = `avg_over_time(kube_pod_labels{%s}[%s])`
  875. // env.GetPromClusterFilter(), durStr
  876. cfg := pds.promConfig
  877. durStr := timeutil.DurationString(end.Sub(start))
  878. if durStr == "" {
  879. panic("failed to parse duration string passed to QueryPodLabels")
  880. }
  881. queryPodLabels := fmt.Sprintf(queryFmtPodLabels, cfg.ClusterFilter, durStr)
  882. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  883. return source.NewFuture(source.DecodePodLabelsResult, ctx.QueryAtTime(queryPodLabels, end))
  884. }
  885. func (pds *PrometheusMetricsQuerier) QueryPodAnnotations(start, end time.Time) *source.Future[source.PodAnnotationsResult] {
  886. const queryFmtPodAnnotations = `avg_over_time(kube_pod_annotations{%s}[%s])`
  887. // env.GetPromClusterFilter(), durStr
  888. cfg := pds.promConfig
  889. durStr := timeutil.DurationString(end.Sub(start))
  890. if durStr == "" {
  891. panic("failed to parse duration string passed to QueryPodAnnotations")
  892. }
  893. queryPodAnnotations := fmt.Sprintf(queryFmtPodAnnotations, cfg.ClusterFilter, durStr)
  894. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  895. return source.NewFuture(source.DecodePodAnnotationsResult, ctx.QueryAtTime(queryPodAnnotations, end))
  896. }
  897. func (pds *PrometheusMetricsQuerier) QueryServiceLabels(start, end time.Time) *source.Future[source.ServiceLabelsResult] {
  898. const queryFmtServiceLabels = `avg_over_time(service_selector_labels{%s}[%s])`
  899. // env.GetPromClusterFilter(), durStr
  900. cfg := pds.promConfig
  901. durStr := timeutil.DurationString(end.Sub(start))
  902. if durStr == "" {
  903. panic("failed to parse duration string passed to QueryServiceLabels")
  904. }
  905. queryServiceLabels := fmt.Sprintf(queryFmtServiceLabels, cfg.ClusterFilter, durStr)
  906. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  907. return source.NewFuture(source.DecodeServiceLabelsResult, ctx.QueryAtTime(queryServiceLabels, end))
  908. }
  909. func (pds *PrometheusMetricsQuerier) QueryDeploymentLabels(start, end time.Time) *source.Future[source.DeploymentLabelsResult] {
  910. const queryFmtDeploymentLabels = `avg_over_time(deployment_match_labels{%s}[%s])`
  911. // env.GetPromClusterFilter(), durStr
  912. cfg := pds.promConfig
  913. durStr := timeutil.DurationString(end.Sub(start))
  914. if durStr == "" {
  915. panic("failed to parse duration string passed to QueryNamespaceAnnotations")
  916. }
  917. queryDeploymentLabels := fmt.Sprintf(queryFmtDeploymentLabels, cfg.ClusterFilter, durStr)
  918. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  919. return source.NewFuture(source.DecodeDeploymentLabelsResult, ctx.QueryAtTime(queryDeploymentLabels, end))
  920. }
  921. func (pds *PrometheusMetricsQuerier) QueryStatefulSetLabels(start, end time.Time) *source.Future[source.StatefulSetLabelsResult] {
  922. const queryFmtStatefulSetLabels = `avg_over_time(statefulSet_match_labels{%s}[%s])`
  923. // env.GetPromClusterFilter(), durStr
  924. cfg := pds.promConfig
  925. durStr := timeutil.DurationString(end.Sub(start))
  926. if durStr == "" {
  927. panic("failed to parse duration string passed to QueryStatefulSetLabels")
  928. }
  929. queryStatefulSetLabels := fmt.Sprintf(queryFmtStatefulSetLabels, cfg.ClusterFilter, durStr)
  930. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  931. return source.NewFuture(source.DecodeStatefulSetLabelsResult, ctx.QueryAtTime(queryStatefulSetLabels, end))
  932. }
  933. func (pds *PrometheusMetricsQuerier) QueryDaemonSetLabels(start, end time.Time) *source.Future[source.DaemonSetLabelsResult] {
  934. const queryFmtDaemonSetLabels = `sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet", %s}[%s])) by (pod, owner_name, namespace, %s)`
  935. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  936. cfg := pds.promConfig
  937. durStr := timeutil.DurationString(end.Sub(start))
  938. if durStr == "" {
  939. panic("failed to parse duration string passed to QueryDaemonSetLabels")
  940. }
  941. queryDaemonSetLabels := fmt.Sprintf(queryFmtDaemonSetLabels, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  942. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  943. return source.NewFuture(source.DecodeDaemonSetLabelsResult, ctx.QueryAtTime(queryDaemonSetLabels, end))
  944. }
  945. func (pds *PrometheusMetricsQuerier) QueryJobLabels(start, end time.Time) *source.Future[source.JobLabelsResult] {
  946. const queryFmtJobLabels = `sum(avg_over_time(kube_pod_owner{owner_kind="Job", %s}[%s])) by (pod, owner_name, namespace ,%s)`
  947. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  948. cfg := pds.promConfig
  949. durStr := timeutil.DurationString(end.Sub(start))
  950. if durStr == "" {
  951. panic("failed to parse duration string passed to QueryJobLabels")
  952. }
  953. queryJobLabels := fmt.Sprintf(queryFmtJobLabels, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  954. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  955. return source.NewFuture(source.DecodeJobLabelsResult, ctx.QueryAtTime(queryJobLabels, end))
  956. }
  957. func (pds *PrometheusMetricsQuerier) QueryPodsWithReplicaSetOwner(start, end time.Time) *source.Future[source.PodsWithReplicaSetOwnerResult] {
  958. const queryFmtPodsWithReplicaSetOwner = `sum(avg_over_time(kube_pod_owner{owner_kind="ReplicaSet", %s}[%s])) by (pod, owner_name, namespace ,%s)`
  959. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  960. cfg := pds.promConfig
  961. durStr := timeutil.DurationString(end.Sub(start))
  962. if durStr == "" {
  963. panic("failed to parse duration string passed to QueryPodsWithReplicaSetOwner")
  964. }
  965. queryPodsWithReplicaSetOwner := fmt.Sprintf(queryFmtPodsWithReplicaSetOwner, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  966. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  967. return source.NewFuture(source.DecodePodsWithReplicaSetOwnerResult, ctx.QueryAtTime(queryPodsWithReplicaSetOwner, end))
  968. }
  969. func (pds *PrometheusMetricsQuerier) QueryReplicaSetsWithoutOwners(start, end time.Time) *source.Future[source.ReplicaSetsWithoutOwnersResult] {
  970. const queryFmtReplicaSetsWithoutOwners = `avg(avg_over_time(kube_replicaset_owner{owner_kind="<none>", owner_name="<none>", %s}[%s])) by (replicaset, namespace, %s)`
  971. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  972. cfg := pds.promConfig
  973. durStr := timeutil.DurationString(end.Sub(start))
  974. if durStr == "" {
  975. panic("failed to parse duration string passed to QueryReplicaSetsWithoutOwners")
  976. }
  977. queryReplicaSetsWithoutOwners := fmt.Sprintf(queryFmtReplicaSetsWithoutOwners, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  978. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  979. return source.NewFuture(source.DecodeReplicaSetsWithoutOwnersResult, ctx.QueryAtTime(queryReplicaSetsWithoutOwners, end))
  980. }
  981. func (pds *PrometheusMetricsQuerier) QueryReplicaSetsWithRollout(start, end time.Time) *source.Future[source.ReplicaSetsWithRolloutResult] {
  982. const queryFmtReplicaSetsWithRolloutOwner = `avg(avg_over_time(kube_replicaset_owner{owner_kind="Rollout", %s}[%s])) by (replicaset, namespace, owner_kind, owner_name, %s)`
  983. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  984. cfg := pds.promConfig
  985. durStr := timeutil.DurationString(end.Sub(start))
  986. if durStr == "" {
  987. panic("failed to parse duration string passed to QueryReplicaSetsWithRollout")
  988. }
  989. queryReplicaSetsWithRolloutOwner := fmt.Sprintf(queryFmtReplicaSetsWithRolloutOwner, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  990. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  991. return source.NewFuture(source.DecodeReplicaSetsWithRolloutResult, ctx.QueryAtTime(queryReplicaSetsWithRolloutOwner, end))
  992. }
  993. func (pds *PrometheusMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time, time.Time, error) {
  994. const (
  995. queryFmtOldestSample = `min_over_time(timestamp(group(node_cpu_hourly_cost{%s}))[%s:%s])`
  996. queryFmtNewestSample = `max_over_time(timestamp(group(node_cpu_hourly_cost{%s}))[%s:%s])`
  997. )
  998. cfg := pds.promConfig
  999. now := time.Now()
  1000. durStr := fmt.Sprintf("%dd", limitDays)
  1001. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1002. queryOldest := fmt.Sprintf(queryFmtOldestSample, cfg.ClusterFilter, durStr, "1h")
  1003. resOldestFut := ctx.QueryAtTime(queryOldest, now)
  1004. resOldest, err := resOldestFut.Await()
  1005. if err != nil {
  1006. return time.Time{}, time.Time{}, fmt.Errorf("querying oldest sample: %w", err)
  1007. }
  1008. if len(resOldest) == 0 || len(resOldest[0].Values) == 0 {
  1009. return time.Time{}, time.Time{}, fmt.Errorf("querying oldest sample: %w", err)
  1010. }
  1011. oldest := time.Unix(int64(resOldest[0].Values[0].Value), 0)
  1012. queryNewest := fmt.Sprintf(queryFmtNewestSample, cfg.ClusterFilter, durStr, "1h")
  1013. resNewestFut := ctx.QueryAtTime(queryNewest, now)
  1014. resNewest, err := resNewestFut.Await()
  1015. if err != nil {
  1016. return time.Time{}, time.Time{}, fmt.Errorf("querying newest sample: %w", err)
  1017. }
  1018. if len(resNewest) == 0 || len(resNewest[0].Values) == 0 {
  1019. return time.Time{}, time.Time{}, fmt.Errorf("querying newest sample: %w", err)
  1020. }
  1021. newest := time.Unix(int64(resNewest[0].Values[0].Value), 0)
  1022. return oldest, newest, nil
  1023. }
  1024. func (pds *PrometheusMetricsQuerier) durationStringFor(start, end time.Time, minsPerResolution int) string {
  1025. dur := end.Sub(start)
  1026. // If using a version of Prometheus where the resolution needs duration offset,
  1027. // we need to apply that here.
  1028. //
  1029. // E.g. avg(node_total_hourly_cost{}) by (node, provider_id)[60m:5m] with
  1030. // time=01:00:00 will return, for a node running the entire time, 12
  1031. // timestamps where the first is 00:05:00 and the last is 01:00:00.
  1032. // However, OpenCost expects for there to be 13 timestamps where the first
  1033. // begins at 00:00:00. To achieve this, we must modify our query to
  1034. // avg(node_total_hourly_cost{}) by (node, provider_id)[65m:5m]
  1035. if pds.promConfig.IsOffsetResolution {
  1036. // increase the query time by the resolution
  1037. dur = dur + (time.Duration(minsPerResolution) * time.Minute)
  1038. }
  1039. return timeutil.DurationString(dur)
  1040. }
  1041. func newEmptyResult[T any](decoder source.ResultDecoder[T]) *source.Future[T] {
  1042. ch := make(source.QueryResultsChan)
  1043. go func() {
  1044. results := source.NewQueryResults("")
  1045. ch <- results
  1046. }()
  1047. return source.NewFuture(decoder, ch)
  1048. }
  1049. func wrapResults[T any](query string, decoder source.ResultDecoder[T], results []*source.QueryResult) *source.Future[T] {
  1050. ch := make(source.QueryResultsChan)
  1051. go func() {
  1052. r := source.NewQueryResults(query)
  1053. r.Results = results
  1054. ch <- r
  1055. }()
  1056. return source.NewFuture(decoder, ch)
  1057. }