datasource.go 96 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205
  1. package prom
  2. import (
  3. "context"
  4. "fmt"
  5. "net/http"
  6. "strconv"
  7. "strings"
  8. "time"
  9. "github.com/julienschmidt/httprouter"
  10. "github.com/opencost/opencost/modules/prometheus-source/pkg/env"
  11. "github.com/opencost/opencost/core/pkg/clusters"
  12. "github.com/opencost/opencost/core/pkg/log"
  13. "github.com/opencost/opencost/core/pkg/protocol"
  14. "github.com/opencost/opencost/core/pkg/source"
  15. "github.com/opencost/opencost/core/pkg/util/httputil"
  16. "github.com/opencost/opencost/core/pkg/util/json"
  17. "github.com/opencost/opencost/core/pkg/util/timeutil"
  18. prometheus "github.com/prometheus/client_golang/api"
  19. prometheusAPI "github.com/prometheus/client_golang/api/prometheus/v1"
  20. )
  21. const (
  22. apiPrefix = "/api/v1"
  23. epAlertManagers = apiPrefix + "/alertmanagers"
  24. epLabelValues = apiPrefix + "/label/:name/values"
  25. epSeries = apiPrefix + "/series"
  26. epTargets = apiPrefix + "/targets"
  27. epSnapshot = apiPrefix + "/admin/tsdb/snapshot"
  28. epDeleteSeries = apiPrefix + "/admin/tsdb/delete_series"
  29. epCleanTombstones = apiPrefix + "/admin/tsdb/clean_tombstones"
  30. epConfig = apiPrefix + "/status/config"
  31. epFlags = apiPrefix + "/status/flags"
  32. epRules = apiPrefix + "/rules"
  33. )
  34. // helper for query range proxy requests
  35. func toStartEndStep(qp httputil.QueryParams) (start, end time.Time, step time.Duration, err error) {
  36. var e error
  37. ss := qp.Get("start", "")
  38. es := qp.Get("end", "")
  39. ds := qp.Get("duration", "")
  40. layout := "2006-01-02T15:04:05.000Z"
  41. start, e = time.Parse(layout, ss)
  42. if e != nil {
  43. err = fmt.Errorf("Error parsing time %s. Error: %s", ss, err)
  44. return
  45. }
  46. end, e = time.Parse(layout, es)
  47. if e != nil {
  48. err = fmt.Errorf("Error parsing time %s. Error: %s", es, err)
  49. return
  50. }
  51. step, e = time.ParseDuration(ds)
  52. if e != nil {
  53. err = fmt.Errorf("Error parsing duration %s. Error: %s", ds, err)
  54. return
  55. }
  56. err = nil
  57. return
  58. }
// providerStorageQueries maps a provider key to a function that builds the
// local storage cost query for that provider over the window [start, end].
// Only the "gcp" entry produces a query; every other entry returns the empty
// string.
//
// FIXME: Before merge, implement a more robust design. This is brittle and bug-prone,
// FIXME: but decouples the prom requirements from the Provider implementations.
var providerStorageQueries = map[string]func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string{
	"aws": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"gcp": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		// TODO Set to the price for the appropriate storage class. It's not trivial to determine the local storage disk type
		// See https://cloud.google.com/compute/disks-image-pricing#persistentdisk
		localStorageCost := 0.04

		// used selects the usage metric instead of the limit metric.
		baseMetric := "container_fs_limit_bytes"
		if used {
			baseMetric = "container_fs_usage_bytes"
		}

		// Cumulative form: sum_over_time over the window, scaled by 60 and 730.
		fmtCumulativeQuery := `sum(
			sum_over_time(%s{device!="tmpfs", id="/", %s}[%s:1m])
		) by (%s) / 60 / 730 / 1024 / 1024 / 1024 * %f`

		// Monthly form: avg_over_time over the window, scaled to GiB only.
		fmtMonthlyQuery := `sum(
			avg_over_time(%s{device!="tmpfs", id="/", %s}[%s:1m])
		) by (%s) / 1024 / 1024 / 1024 * %f`

		// rate selects the monthly form; the cumulative form is the default.
		fmtQuery := fmtCumulativeQuery
		if rate {
			fmtQuery = fmtMonthlyQuery
		}

		// The query window is the span between start and end.
		fmtWindow := timeutil.DurationString(end.Sub(start))

		return fmt.Sprintf(fmtQuery, baseMetric, config.ClusterFilter, fmtWindow, config.ClusterLabel, localStorageCost)
	},
	"azure": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"alibaba": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"scaleway": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"otc": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"oracle": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"csv": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
	"custom": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
		return ""
	},
}
  108. // creates a new help error which indicates the caller can retry and is non-fatal.
  109. func newHelpRetryError(format string, args ...any) error {
  110. formatWithHelp := format + "\nTroubleshooting help available at: %s"
  111. args = append(args, PrometheusTroubleshootingURL)
  112. cause := fmt.Errorf(formatWithHelp, args...)
  113. return source.NewHelpRetryError(cause)
  114. }
  115. // PrometheusDataSource is the OpenCost data source implementation leveraging Prometheus. Prometheus provides longer retention periods and
  116. // more detailed metrics than the OpenCost Collector, which is useful for historical analysis and cost forecasting.
  117. type PrometheusDataSource struct {
  118. promConfig *OpenCostPrometheusConfig
  119. promClient prometheus.Client
  120. promContexts *ContextFactory
  121. thanosConfig *OpenCostThanosConfig
  122. thanosClient prometheus.Client
  123. thanosContexts *ContextFactory
  124. }
  125. // NewDefaultPrometheusDataSource creates and initializes a new `PrometheusDataSource` with configuration
  126. // parsed from environment variables. This function will block until a connection to prometheus is established,
  127. // or fails. It is recommended to run this function in a goroutine on a retry cycle.
  128. func NewDefaultPrometheusDataSource() (*PrometheusDataSource, error) {
  129. config, err := NewOpenCostPrometheusConfigFromEnv()
  130. if err != nil {
  131. return nil, fmt.Errorf("failed to create prometheus config from env: %w", err)
  132. }
  133. var thanosConfig *OpenCostThanosConfig
  134. if env.IsThanosEnabled() {
  135. // thanos initialization is not fatal, so we log the error and continue
  136. thanosConfig, err = NewOpenCostThanosConfigFromEnv()
  137. if err != nil {
  138. log.Warnf("Thanos was enabled, but failed to create thanos config from env: %s. Continuing...", err.Error())
  139. }
  140. }
  141. return NewPrometheusDataSource(config, thanosConfig)
  142. }
  143. // NewPrometheusDataSource initializes clients for Prometheus and Thanos, and returns a new PrometheusDataSource.
  144. func NewPrometheusDataSource(promConfig *OpenCostPrometheusConfig, thanosConfig *OpenCostThanosConfig) (*PrometheusDataSource, error) {
  145. promClient, err := NewPrometheusClient(promConfig.ServerEndpoint, promConfig.ClientConfig)
  146. if err != nil {
  147. return nil, fmt.Errorf("failed to build prometheus client: %w", err)
  148. }
  149. // validation of the prometheus client
  150. m, err := Validate(promClient, promConfig)
  151. if err != nil || !m.Running {
  152. if err != nil {
  153. return nil, newHelpRetryError("failed to query prometheus at %s: %w", promConfig.ServerEndpoint, err)
  154. } else if !m.Running {
  155. return nil, newHelpRetryError("prometheus at %s is not running", promConfig.ServerEndpoint)
  156. }
  157. } else {
  158. log.Infof("Success: retrieved the 'up' query against prometheus at: %s", promConfig.ServerEndpoint)
  159. }
  160. // we don't consider this a fatal error, but we log for visibility
  161. api := prometheusAPI.NewAPI(promClient)
  162. _, err = api.Buildinfo(context.Background())
  163. if err != nil {
  164. log.Infof("No valid prometheus config file at %s. Error: %s.\nTroubleshooting help available at: %s.\n**Ignore if using cortex/mimir/thanos here**", promConfig.ServerEndpoint, err.Error(), PrometheusTroubleshootingURL)
  165. } else {
  166. log.Infof("Retrieved a prometheus config file from: %s", promConfig.ServerEndpoint)
  167. }
  168. // Fix scrape interval if zero by attempting to lookup the interval for the configured job
  169. if promConfig.ScrapeInterval == 0 {
  170. promConfig.ScrapeInterval = time.Minute
  171. // Lookup scrape interval for kubecost job, update if found
  172. si, err := ScrapeIntervalFor(promClient, promConfig.JobName)
  173. if err == nil {
  174. promConfig.ScrapeInterval = si
  175. }
  176. }
  177. log.Infof("Using scrape interval of %f", promConfig.ScrapeInterval.Seconds())
  178. promContexts := NewContextFactory(promClient, promConfig)
  179. var thanosClient prometheus.Client
  180. var thanosContexts *ContextFactory
  181. // if the thanos configuration is non-nil, we assume intent to use thanos. However, failure to
  182. // initialize the thanos client is not fatal, and we will log the error and continue.
  183. if thanosConfig != nil {
  184. thanosHost := thanosConfig.ServerEndpoint
  185. if thanosHost != "" {
  186. thanosCli, _ := NewThanosClient(thanosHost, thanosConfig)
  187. _, err = Validate(thanosCli, thanosConfig.OpenCostPrometheusConfig)
  188. if err != nil {
  189. log.Warnf("Failed to query Thanos at %s. Error: %s.", thanosHost, err.Error())
  190. thanosClient = thanosCli
  191. } else {
  192. log.Infof("Success: retrieved the 'up' query against Thanos at: %s", thanosHost)
  193. thanosClient = thanosCli
  194. }
  195. thanosContexts = NewContextFactory(thanosClient, thanosConfig.OpenCostPrometheusConfig)
  196. } else {
  197. log.Infof("Error resolving environment variable: $%s", env.ThanosQueryUrlEnvVar)
  198. }
  199. }
  200. return &PrometheusDataSource{
  201. promConfig: promConfig,
  202. promClient: promClient,
  203. promContexts: promContexts,
  204. thanosConfig: thanosConfig,
  205. thanosClient: thanosClient,
  206. thanosContexts: thanosContexts,
  207. }, nil
  208. }
// proto is the shared HTTP protocol helper the handlers in this file use to
// build responses (ToResponse) and write them (WriteResponse).
var proto = protocol.HTTP()
  210. // prometheusMetadata returns the metadata for the prometheus server
  211. func (pds *PrometheusDataSource) prometheusMetadata(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  212. w.Header().Set("Content-Type", "application/json")
  213. w.Header().Set("Access-Control-Allow-Origin", "*")
  214. resp := proto.ToResponse(Validate(pds.promClient, pds.promConfig))
  215. proto.WriteResponse(w, resp)
  216. }
  217. // prometheusRecordingRules is a proxy for /rules against prometheus
  218. func (pds *PrometheusDataSource) prometheusRecordingRules(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  219. w.Header().Set("Content-Type", "application/json")
  220. w.Header().Set("Access-Control-Allow-Origin", "*")
  221. u := pds.promClient.URL(epRules, nil)
  222. req, err := http.NewRequest(http.MethodGet, u.String(), nil)
  223. if err != nil {
  224. fmt.Fprintf(w, "error creating Prometheus rule request: %s", err)
  225. return
  226. }
  227. _, body, err := pds.promClient.Do(r.Context(), req)
  228. if err != nil {
  229. fmt.Fprintf(w, "error making Prometheus rule request: %s", err)
  230. return
  231. }
  232. w.Write(body)
  233. }
  234. // prometheusConfig returns the current configuration of the prometheus server
  235. func (pds *PrometheusDataSource) prometheusConfig(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  236. w.Header().Set("Content-Type", "application/json")
  237. w.Header().Set("Access-Control-Allow-Origin", "*")
  238. pConfig := map[string]string{
  239. "address": pds.promConfig.ServerEndpoint,
  240. }
  241. body, err := json.Marshal(pConfig)
  242. if err != nil {
  243. fmt.Fprintf(w, "Error marshalling prometheus config")
  244. } else {
  245. w.Write(body)
  246. }
  247. }
  248. // prometheusTargets is a proxy for /targets against prometheus
  249. func (pds *PrometheusDataSource) prometheusTargets(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  250. w.Header().Set("Content-Type", "application/json")
  251. w.Header().Set("Access-Control-Allow-Origin", "*")
  252. u := pds.promClient.URL(epTargets, nil)
  253. req, err := http.NewRequest(http.MethodGet, u.String(), nil)
  254. if err != nil {
  255. fmt.Fprintf(w, "error creating Prometheus rule request: %s", err)
  256. return
  257. }
  258. _, body, err := pds.promClient.Do(r.Context(), req)
  259. if err != nil {
  260. fmt.Fprintf(w, "error making Prometheus rule request: %s", err)
  261. return
  262. }
  263. w.Write(body)
  264. }
  265. // status returns the status of the prometheus client
  266. func (pds *PrometheusDataSource) status(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  267. w.Header().Set("Content-Type", "application/json")
  268. w.Header().Set("Access-Control-Allow-Origin", "*")
  269. promServer := pds.promConfig.ServerEndpoint
  270. api := prometheusAPI.NewAPI(pds.promClient)
  271. result, err := api.Buildinfo(r.Context())
  272. if err != nil {
  273. fmt.Fprintf(w, "Using Prometheus at %s, Error: %s", promServer, err)
  274. } else {
  275. fmt.Fprintf(w, "Using Prometheus at %s, version: %s", promServer, result.Version)
  276. }
  277. }
  278. // prometheusQuery is a proxy for /query against prometheus
  279. func (pds *PrometheusDataSource) prometheusQuery(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  280. w.Header().Set("Content-Type", "application/json")
  281. w.Header().Set("Access-Control-Allow-Origin", "*")
  282. qp := httputil.NewQueryParams(r.URL.Query())
  283. query := qp.Get("query", "")
  284. if query == "" {
  285. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Query Parameter 'query' is unset'")))
  286. return
  287. }
  288. // Attempt to parse time as either a unix timestamp or as an RFC3339 value
  289. var timeVal time.Time
  290. timeStr := qp.Get("time", "")
  291. if len(timeStr) > 0 {
  292. if t, err := strconv.ParseInt(timeStr, 10, 64); err == nil {
  293. timeVal = time.Unix(t, 0)
  294. } else if t, err := time.Parse(time.RFC3339, timeStr); err == nil {
  295. timeVal = t
  296. }
  297. // If time is given, but not parse-able, return an error
  298. if timeVal.IsZero() {
  299. http.Error(w, fmt.Sprintf("time must be a unix timestamp or RFC3339 value; illegal value given: %s", timeStr), http.StatusBadRequest)
  300. }
  301. }
  302. ctx := pds.promContexts.NewNamedContext(FrontendContextName)
  303. body, err := ctx.RawQuery(query, timeVal)
  304. if err != nil {
  305. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Error running query %s. Error: %s", query, err)))
  306. return
  307. }
  308. w.Write(body) // prometheusQueryRange is a proxy for /query_range against prometheus
  309. }
  310. func (pds *PrometheusDataSource) prometheusQueryRange(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  311. w.Header().Set("Content-Type", "application/json")
  312. w.Header().Set("Access-Control-Allow-Origin", "*")
  313. qp := httputil.NewQueryParams(r.URL.Query())
  314. query := qp.Get("query", "")
  315. if query == "" {
  316. fmt.Fprintf(w, "Error parsing query from request parameters.")
  317. return
  318. }
  319. start, end, duration, err := toStartEndStep(qp)
  320. if err != nil {
  321. fmt.Fprintf(w, "error: %s", err)
  322. return
  323. }
  324. ctx := pds.promContexts.NewNamedContext(FrontendContextName)
  325. body, err := ctx.RawQueryRange(query, start, end, duration)
  326. if err != nil {
  327. fmt.Fprintf(w, "Error running query %s. Error: %s", query, err)
  328. return
  329. }
  330. w.Write(body)
  331. }
  332. // thanosQuery is a proxy for /query against thanos
  333. func (pds *PrometheusDataSource) thanosQuery(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  334. w.Header().Set("Content-Type", "application/json")
  335. w.Header().Set("Access-Control-Allow-Origin", "*")
  336. if pds.thanosClient == nil {
  337. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("ThanosDisabled")))
  338. return
  339. }
  340. qp := httputil.NewQueryParams(r.URL.Query())
  341. query := qp.Get("query", "")
  342. if query == "" {
  343. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Query Parameter 'query' is unset'")))
  344. return
  345. }
  346. // Attempt to parse time as either a unix timestamp or as an RFC3339 value
  347. var timeVal time.Time
  348. timeStr := qp.Get("time", "")
  349. if len(timeStr) > 0 {
  350. if t, err := strconv.ParseInt(timeStr, 10, 64); err == nil {
  351. timeVal = time.Unix(t, 0)
  352. } else if t, err := time.Parse(time.RFC3339, timeStr); err == nil {
  353. timeVal = t
  354. }
  355. // If time is given, but not parse-able, return an error
  356. if timeVal.IsZero() {
  357. http.Error(w, fmt.Sprintf("time must be a unix timestamp or RFC3339 value; illegal value given: %s", timeStr), http.StatusBadRequest)
  358. }
  359. }
  360. ctx := pds.thanosContexts.NewNamedContext(FrontendContextName)
  361. body, err := ctx.RawQuery(query, timeVal)
  362. if err != nil {
  363. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Error running query %s. Error: %s", query, err)))
  364. return
  365. }
  366. w.Write(body)
  367. }
  368. // thanosQueryRange is a proxy for /query_range against thanos
  369. func (pds *PrometheusDataSource) thanosQueryRange(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  370. w.Header().Set("Content-Type", "application/json")
  371. w.Header().Set("Access-Control-Allow-Origin", "*")
  372. if pds.thanosClient == nil {
  373. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("ThanosDisabled")))
  374. return
  375. }
  376. qp := httputil.NewQueryParams(r.URL.Query())
  377. query := qp.Get("query", "")
  378. if query == "" {
  379. fmt.Fprintf(w, "Error parsing query from request parameters.")
  380. return
  381. }
  382. start, end, duration, err := toStartEndStep(qp)
  383. if err != nil {
  384. fmt.Fprintf(w, "error: %s", err)
  385. return
  386. }
  387. ctx := pds.thanosContexts.NewNamedContext(FrontendContextName)
  388. body, err := ctx.RawQueryRange(query, start, end, duration)
  389. if err != nil {
  390. fmt.Fprintf(w, "Error running query %s. Error: %s", query, err)
  391. return
  392. }
  393. w.Write(body)
  394. }
  395. // promtheusQueueState returns the current state of the prometheus and thanos request queues
  396. func (pds *PrometheusDataSource) prometheusQueueState(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  397. w.Header().Set("Content-Type", "application/json")
  398. w.Header().Set("Access-Control-Allow-Origin", "*")
  399. promQueueState, err := GetPrometheusQueueState(pds.promClient, pds.promConfig)
  400. if err != nil {
  401. proto.WriteResponse(w, proto.ToResponse(nil, err))
  402. return
  403. }
  404. result := map[string]*PrometheusQueueState{
  405. "prometheus": promQueueState,
  406. }
  407. if pds.thanosClient != nil {
  408. thanosQueueState, err := GetPrometheusQueueState(pds.thanosClient, pds.thanosConfig.OpenCostPrometheusConfig)
  409. if err != nil {
  410. log.Warnf("Error getting Thanos queue state: %s", err)
  411. } else {
  412. result["thanos"] = thanosQueueState
  413. }
  414. }
  415. proto.WriteResponse(w, proto.ToResponse(result, nil))
  416. }
  417. // prometheusMetrics retrieves availability of Prometheus and Thanos metrics
  418. func (pds *PrometheusDataSource) prometheusMetrics(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  419. w.Header().Set("Content-Type", "application/json")
  420. w.Header().Set("Access-Control-Allow-Origin", "*")
  421. promMetrics := GetPrometheusMetrics(pds.promClient, pds.promConfig, "")
  422. result := map[string][]*PrometheusDiagnostic{
  423. "prometheus": promMetrics,
  424. }
  425. if pds.thanosClient != nil {
  426. thanosMetrics := GetPrometheusMetrics(pds.thanosClient, pds.thanosConfig.OpenCostPrometheusConfig, pds.thanosConfig.Offset)
  427. result["thanos"] = thanosMetrics
  428. }
  429. proto.WriteResponse(w, proto.ToResponse(result, nil))
  430. }
// PrometheusClient returns the client this data source uses to reach
// prometheus.
func (pds *PrometheusDataSource) PrometheusClient() prometheus.Client {
	return pds.promClient
}
// PrometheusConfig returns the prometheus configuration held by this data
// source. The returned pointer is the data source's own instance, not a copy.
func (pds *PrometheusDataSource) PrometheusConfig() *OpenCostPrometheusConfig {
	return pds.promConfig
}
// PrometheusContexts returns the factory used to create query contexts against
// prometheus.
func (pds *PrometheusDataSource) PrometheusContexts() *ContextFactory {
	return pds.promContexts
}
// ThanosClient returns the client this data source uses to reach thanos. It is
// nil when no thanos client was set up.
func (pds *PrometheusDataSource) ThanosClient() prometheus.Client {
	return pds.thanosClient
}
// ThanosConfig returns the thanos configuration held by this data source, which
// may be nil.
func (pds *PrometheusDataSource) ThanosConfig() *OpenCostThanosConfig {
	return pds.thanosConfig
}
// ThanosContexts returns the factory used to create query contexts against
// thanos. It is nil when no thanos client was set up.
func (pds *PrometheusDataSource) ThanosContexts() *ContextFactory {
	return pds.thanosContexts
}
  449. func (pds *PrometheusDataSource) NewClusterMap(clusterInfoProvider clusters.ClusterInfoProvider) clusters.ClusterMap {
  450. if pds.thanosClient != nil {
  451. return newPrometheusClusterMap(pds.thanosContexts, clusterInfoProvider, 10*time.Minute)
  452. }
  453. return newPrometheusClusterMap(pds.promContexts, clusterInfoProvider, 5*time.Minute)
  454. }
  455. func (pds *PrometheusDataSource) RegisterEndPoints(router *httprouter.Router) {
  456. // endpoints migrated from server
  457. router.GET("/validatePrometheus", pds.prometheusMetadata)
  458. router.GET("/prometheusRecordingRules", pds.prometheusRecordingRules)
  459. router.GET("/prometheusConfig", pds.prometheusConfig)
  460. router.GET("/prometheusTargets", pds.prometheusTargets)
  461. router.GET("/status", pds.status)
  462. // prom query proxies
  463. router.GET("/prometheusQuery", pds.prometheusQuery)
  464. router.GET("/prometheusQueryRange", pds.prometheusQueryRange)
  465. router.GET("/thanosQuery", pds.thanosQuery)
  466. router.GET("/thanosQueryRange", pds.thanosQueryRange)
  467. // diagnostics
  468. router.GET("/diagnostics/requestQueue", pds.prometheusQueueState)
  469. router.GET("/diagnostics/prometheusMetrics", pds.prometheusMetrics)
  470. }
// RefreshInterval returns the scrape interval from the prometheus
// configuration.
func (pds *PrometheusDataSource) RefreshInterval() time.Duration {
	return pds.promConfig.ScrapeInterval
}
// BatchDuration returns the maximum query duration from the prometheus
// configuration.
func (pds *PrometheusDataSource) BatchDuration() time.Duration {
	return pds.promConfig.MaxQueryDuration
}
// Resolution returns the data resolution from the prometheus configuration.
func (pds *PrometheusDataSource) Resolution() time.Duration {
	return pds.promConfig.DataResolution
}
  480. func (pds *PrometheusDataSource) MetaData() map[string]string {
  481. thanosEnabled := pds.thanosClient != nil
  482. metadata := map[string]string{
  483. clusters.ClusterInfoThanosEnabledKey: fmt.Sprintf("%t", thanosEnabled),
  484. }
  485. if thanosEnabled {
  486. metadata[clusters.ClusterInfoThanosOffsetKey] = pds.thanosConfig.Offset
  487. }
  488. return metadata
  489. }
  490. //--------------------------------------------------------------------------
  491. // InstantMetricsQuerier
  492. //--------------------------------------------------------------------------
  493. func (pds *PrometheusDataSource) QueryPVPricePerGiBHour(start, end time.Time) *source.Future[source.PVPricePerGiBHourResult] {
  494. const pvCostQuery = `avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (%s, persistentvolume, volumename, provider_id)`
  495. durStr := timeutil.DurationString(end.Sub(start))
  496. if durStr == "" {
  497. panic("failed to parse duration string passed to QueryPVCost")
  498. }
  499. queryPVCost := fmt.Sprintf(pvCostQuery, pds.promConfig.ClusterFilter, durStr, pds.promConfig.ClusterLabel)
  500. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  501. return source.NewFuture(source.DecodePVPricePerGiBHourResult, ctx.QueryAtTime(queryPVCost, end))
  502. }
  503. func (pds *PrometheusDataSource) QueryPVUsedAverage(start, end time.Time) *source.Future[source.PVUsedAvgResult] {
  504. // `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  505. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  506. const pvUsedAverageQuery = `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  507. cfg := pds.promConfig
  508. durStr := timeutil.DurationString(end.Sub(start))
  509. if durStr == "" {
  510. panic("failed to parse duration string passed to QueryPVUsedAverage")
  511. }
  512. queryPVUsedAvg := fmt.Sprintf(pvUsedAverageQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  513. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  514. return source.NewFuture(source.DecodePVUsedAvgResult, ctx.QueryAtTime(queryPVUsedAvg, end))
  515. }
  516. func (pds *PrometheusDataSource) QueryPVUsedMax(start, end time.Time) *source.Future[source.PVUsedMaxResult] {
  517. // `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  518. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  519. const pvUsedMaxQuery = `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  520. cfg := pds.promConfig
  521. durStr := timeutil.DurationString(end.Sub(start))
  522. if durStr == "" {
  523. panic("failed to parse duration string passed to QueryPVUsedMax")
  524. }
  525. queryPVUsedMax := fmt.Sprintf(pvUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  526. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  527. return source.NewFuture(source.DecodePVUsedMaxResult, ctx.QueryAtTime(queryPVUsedMax, end))
  528. }
  529. func (pds *PrometheusDataSource) QueryPVCInfo(start, end time.Time) *source.Future[source.PVCInfoResult] {
  530. const queryFmtPVCInfo = `avg(kube_persistentvolumeclaim_info{volumename != "", %s}) by (persistentvolumeclaim, storageclass, volumename, namespace, %s)[%s:%s]`
  531. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  532. cfg := pds.promConfig
  533. resolution := cfg.DataResolution
  534. resStr := timeutil.DurationString(resolution)
  535. durStr := timeutil.DurationString(end.Sub(start))
  536. if durStr == "" {
  537. panic("failed to parse duration string passed to QueryPVCInfo")
  538. }
  539. queryPVCInfo := fmt.Sprintf(queryFmtPVCInfo, cfg.ClusterFilter, cfg.ClusterLabel, durStr, resStr)
  540. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  541. return source.NewFuture(source.DecodePVCInfoResult, ctx.QueryAtTime(queryPVCInfo, end))
  542. }
  543. func (pds *PrometheusDataSource) QueryPVActiveMinutes(start, end time.Time) *source.Future[source.PVActiveMinutesResult] {
  544. const pvActiveMinsQuery = `avg(kube_persistentvolume_capacity_bytes{%s}) by (%s, persistentvolume)[%s:%dm]`
  545. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  546. cfg := pds.promConfig
  547. minsPerResolution := cfg.DataResolutionMinutes
  548. durStr := timeutil.DurationString(end.Sub(start))
  549. if durStr == "" {
  550. panic("failed to parse duration string passed to QueryPVActiveMinutes")
  551. }
  552. queryPVActiveMins := fmt.Sprintf(pvActiveMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  553. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  554. return source.NewFuture(source.DecodePVActiveMinutesResult, ctx.QueryAtTime(queryPVActiveMins, end))
  555. }
  556. func (pds *PrometheusDataSource) QueryLocalStorageCost(start, end time.Time) *source.Future[source.LocalStorageCostResult] {
  557. // `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  558. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  559. const localStorageCostQuery = `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  560. cfg := pds.promConfig
  561. resolution := cfg.DataResolution
  562. durStr := timeutil.DurationString(end.Sub(start))
  563. if durStr == "" {
  564. panic("failed to parse duration string passed to QueryLocalStorageCost")
  565. }
  566. //Ensuring if data resolution is less than 60s default it to 1m
  567. var minsPerResolution int
  568. if minsPerResolution = int(resolution.Minutes()); int(resolution.Minutes()) == 0 {
  569. minsPerResolution = 1
  570. log.DedupedWarningf(3, "QueryLocalStorageCost: Configured resolution (%d seconds) is below the 60 seconds threshold. Overriding with 1 minute.", int(resolution.Seconds()))
  571. }
  572. // hourlyToCumulative is a scaling factor that, when multiplied by an
  573. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  574. // [min/res]*[hr/min] = [$/res]
  575. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  576. costPerGBHr := 0.04 / 730.0
  577. queryLocalStorageCost := fmt.Sprintf(localStorageCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  578. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  579. return source.NewFuture(source.DecodeLocalStorageCostResult, ctx.QueryAtTime(queryLocalStorageCost, end))
  580. }
  581. func (pds *PrometheusDataSource) QueryLocalStorageUsedCost(start, end time.Time) *source.Future[source.LocalStorageUsedCostResult] {
  582. // `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  583. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  584. const localStorageUsedCostQuery = `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  585. cfg := pds.promConfig
  586. minsPerResolution := cfg.DataResolutionMinutes
  587. durStr := timeutil.DurationString(end.Sub(start))
  588. if durStr == "" {
  589. panic("failed to parse duration string passed to QueryLocalStorageUsedCost")
  590. }
  591. // hourlyToCumulative is a scaling factor that, when multiplied by an
  592. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  593. // [min/res]*[hr/min] = [$/res]
  594. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  595. costPerGBHr := 0.04 / 730.0
  596. queryLocalStorageUsedCost := fmt.Sprintf(localStorageUsedCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  597. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  598. return source.NewFuture(source.DecodeLocalStorageUsedCostResult, ctx.QueryAtTime(queryLocalStorageUsedCost, end))
  599. }
  600. func (pds *PrometheusDataSource) QueryLocalStorageUsedAvg(start, end time.Time) *source.Future[source.LocalStorageUsedAvgResult] {
  601. // `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  602. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  603. const localStorageUsedAvgQuery = `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  604. cfg := pds.promConfig
  605. durStr := timeutil.DurationString(end.Sub(start))
  606. if durStr == "" {
  607. panic("failed to parse duration string passed to QueryLocalStorageUsedAvg")
  608. }
  609. queryLocalStorageUsedAvg := fmt.Sprintf(localStorageUsedAvgQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  610. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  611. return source.NewFuture(source.DecodeLocalStorageUsedAvgResult, ctx.QueryAtTime(queryLocalStorageUsedAvg, end))
  612. }
  613. func (pds *PrometheusDataSource) QueryLocalStorageUsedMax(start, end time.Time) *source.Future[source.LocalStorageUsedMaxResult] {
  614. // `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  615. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  616. const localStorageUsedMaxQuery = `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  617. cfg := pds.promConfig
  618. durStr := timeutil.DurationString(end.Sub(start))
  619. if durStr == "" {
  620. panic("failed to parse duration string passed to QueryLocalStorageUsedMax")
  621. }
  622. queryLocalStorageUsedMax := fmt.Sprintf(localStorageUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  623. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  624. return source.NewFuture(source.DecodeLocalStorageUsedMaxResult, ctx.QueryAtTime(queryLocalStorageUsedMax, end))
  625. }
  626. func (pds *PrometheusDataSource) QueryLocalStorageBytes(start, end time.Time) *source.Future[source.LocalStorageBytesResult] {
  627. // `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  628. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  629. const localStorageBytesQuery = `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  630. cfg := pds.promConfig
  631. minsPerResolution := cfg.DataResolutionMinutes
  632. durStr := timeutil.DurationString(end.Sub(start))
  633. if durStr == "" {
  634. panic("failed to parse duration string passed to QueryLocalStorageBytes")
  635. }
  636. queryLocalStorageBytes := fmt.Sprintf(localStorageBytesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  637. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  638. return source.NewFuture(source.DecodeLocalStorageBytesResult, ctx.QueryAtTime(queryLocalStorageBytes, end))
  639. }
  640. func (pds *PrometheusDataSource) QueryLocalStorageActiveMinutes(start, end time.Time) *source.Future[source.LocalStorageActiveMinutesResult] {
  641. // `count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`
  642. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  643. const localStorageActiveMinutesQuery = `count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`
  644. cfg := pds.promConfig
  645. minsPerResolution := cfg.DataResolutionMinutes
  646. durStr := timeutil.DurationString(end.Sub(start))
  647. if durStr == "" {
  648. panic("failed to parse duration string passed to QueryLocalStorageActiveMinutes")
  649. }
  650. queryLocalStorageActiveMins := fmt.Sprintf(localStorageActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  651. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  652. return source.NewFuture(source.DecodeLocalStorageActiveMinutesResult, ctx.QueryAtTime(queryLocalStorageActiveMins, end))
  653. }
  654. func (pds *PrometheusDataSource) QueryLocalStorageBytesByProvider(provider string, start, end time.Time) *source.Future[source.LocalStorageBytesByProviderResult] {
  655. var localStorageBytesQuery string
  656. key := strings.ToLower(provider)
  657. if f, ok := providerStorageQueries[key]; ok {
  658. localStorageBytesQuery = f(pds.promConfig, start, end, false, false)
  659. } else {
  660. localStorageBytesQuery = ""
  661. }
  662. if localStorageBytesQuery == "" {
  663. return newEmptyResult(source.DecodeLocalStorageBytesByProviderResult)
  664. }
  665. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  666. return source.NewFuture(source.DecodeLocalStorageBytesByProviderResult, ctx.QueryAtTime(localStorageBytesQuery, end))
  667. }
  668. func (pds *PrometheusDataSource) QueryLocalStorageUsedByProvider(provider string, start, end time.Time) *source.Future[source.LocalStorageUsedByProviderResult] {
  669. var localStorageUsedQuery string
  670. key := strings.ToLower(provider)
  671. if f, ok := providerStorageQueries[key]; ok {
  672. localStorageUsedQuery = f(pds.promConfig, start, end, false, true)
  673. } else {
  674. localStorageUsedQuery = ""
  675. }
  676. if localStorageUsedQuery == "" {
  677. return newEmptyResult(source.DecodeLocalStorageUsedByProviderResult)
  678. }
  679. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  680. return source.NewFuture(source.DecodeLocalStorageUsedByProviderResult, ctx.QueryAtTime(localStorageUsedQuery, end))
  681. }
  682. func (pds *PrometheusDataSource) QueryNodeCPUCoresCapacity(start, end time.Time) *source.Future[source.NodeCPUCoresCapacityResult] {
  683. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  684. const nodeCPUCoresCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (%s, node)`
  685. cfg := pds.promConfig
  686. durStr := timeutil.DurationString(end.Sub(start))
  687. if durStr == "" {
  688. panic("failed to parse duration string passed to QueryNodeCPUCoresCapacity")
  689. }
  690. queryNodeCPUCoresCapacity := fmt.Sprintf(nodeCPUCoresCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  691. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  692. return source.NewFuture(source.DecodeNodeCPUCoresCapacityResult, ctx.QueryAtTime(queryNodeCPUCoresCapacity, end))
  693. }
  694. func (pds *PrometheusDataSource) QueryNodeCPUCoresAllocatable(start, end time.Time) *source.Future[source.NodeCPUCoresAllocatableResult] {
  695. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  696. const nodeCPUCoresAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_cpu_cores{%s}[%s])) by (%s, node)`
  697. // `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  698. cfg := pds.promConfig
  699. durStr := timeutil.DurationString(end.Sub(start))
  700. if durStr == "" {
  701. panic("failed to parse duration string passed to QueryNodeCPUCoresAllocatable")
  702. }
  703. queryNodeCPUCoresAllocatable := fmt.Sprintf(nodeCPUCoresAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  704. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  705. return source.NewFuture(source.DecodeNodeCPUCoresAllocatableResult, ctx.QueryAtTime(queryNodeCPUCoresAllocatable, end))
  706. }
  707. func (pds *PrometheusDataSource) QueryNodeRAMBytesCapacity(start, end time.Time) *source.Future[source.NodeRAMBytesCapacityResult] {
  708. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  709. const nodeRAMBytesCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (%s, node)`
  710. cfg := pds.promConfig
  711. durStr := timeutil.DurationString(end.Sub(start))
  712. if durStr == "" {
  713. panic("failed to parse duration string passed to QueryNodeRAMBytesCapacity")
  714. }
  715. queryNodeRAMBytesCapacity := fmt.Sprintf(nodeRAMBytesCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  716. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  717. return source.NewFuture(source.DecodeNodeRAMBytesCapacityResult, ctx.QueryAtTime(queryNodeRAMBytesCapacity, end))
  718. }
  719. func (pds *PrometheusDataSource) QueryNodeRAMBytesAllocatable(start, end time.Time) *source.Future[source.NodeRAMBytesAllocatableResult] {
  720. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  721. const nodeRAMBytesAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_memory_bytes{%s}[%s])) by (%s, node)`
  722. cfg := pds.promConfig
  723. durStr := timeutil.DurationString(end.Sub(start))
  724. if durStr == "" {
  725. panic("failed to parse duration string passed to QueryNodeRAMBytesAllocatable")
  726. }
  727. queryNodeRAMBytesAllocatable := fmt.Sprintf(nodeRAMBytesAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  728. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  729. return source.NewFuture(source.DecodeNodeRAMBytesAllocatableResult, ctx.QueryAtTime(queryNodeRAMBytesAllocatable, end))
  730. }
  731. func (pds *PrometheusDataSource) QueryNodeGPUCount(start, end time.Time) *source.Future[source.NodeGPUCountResult] {
  732. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  733. const nodeGPUCountQuery = `avg(avg_over_time(node_gpu_count{%s}[%s])) by (%s, node, provider_id)`
  734. cfg := pds.promConfig
  735. durStr := timeutil.DurationString(end.Sub(start))
  736. if durStr == "" {
  737. panic("failed to parse duration string passed to QueryNodeGPUCount")
  738. }
  739. queryNodeGPUCount := fmt.Sprintf(nodeGPUCountQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  740. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  741. return source.NewFuture(source.DecodeNodeGPUCountResult, ctx.QueryAtTime(queryNodeGPUCount, end))
  742. }
  743. func (pds *PrometheusDataSource) QueryNodeLabels(start, end time.Time) *source.Future[source.NodeLabelsResult] {
  744. const labelsQuery = `avg_over_time(kube_node_labels{%s}[%s])`
  745. // env.GetPromClusterFilter(), durStr
  746. cfg := pds.promConfig
  747. durStr := timeutil.DurationString(end.Sub(start))
  748. if durStr == "" {
  749. panic("failed to parse duration string passed to QueryNodeLabels")
  750. }
  751. queryLabels := fmt.Sprintf(labelsQuery, cfg.ClusterFilter, durStr)
  752. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  753. return source.NewFuture(source.DecodeNodeLabelsResult, ctx.QueryAtTime(queryLabels, end))
  754. }
  755. func (pds *PrometheusDataSource) QueryNodeActiveMinutes(start, end time.Time) *source.Future[source.NodeActiveMinutesResult] {
  756. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  757. const activeMinsQuery = `avg(node_total_hourly_cost{%s}) by (node, %s, provider_id)[%s:%dm]`
  758. cfg := pds.promConfig
  759. minsPerResolution := cfg.DataResolutionMinutes
  760. durStr := timeutil.DurationString(end.Sub(start))
  761. if durStr == "" {
  762. panic("failed to parse duration string passed to QueryNodeActiveMinutes")
  763. }
  764. queryActiveMins := fmt.Sprintf(activeMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  765. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  766. return source.NewFuture(source.DecodeNodeActiveMinutesResult, ctx.QueryAtTime(queryActiveMins, end))
  767. }
  768. func (pds *PrometheusDataSource) QueryNodeCPUModeTotal(start, end time.Time) *source.Future[source.NodeCPUModeTotalResult] {
  769. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel())
  770. const nodeCPUModeTotalQuery = `sum(rate(node_cpu_seconds_total{%s}[%s:%dm])) by (kubernetes_node, %s, mode)`
  771. cfg := pds.promConfig
  772. minsPerResolution := cfg.DataResolutionMinutes
  773. durStr := timeutil.DurationString(end.Sub(start))
  774. if durStr == "" {
  775. panic("failed to parse duration string passed to QueryNodeCPUModeTotal")
  776. }
  777. queryCPUModeTotal := fmt.Sprintf(nodeCPUModeTotalQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  778. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  779. return source.NewFuture(source.DecodeNodeCPUModeTotalResult, ctx.QueryAtTime(queryCPUModeTotal, end))
  780. }
  781. func (pds *PrometheusDataSource) QueryNodeCPUModePercent(start, end time.Time) *source.Future[source.NodeCPUModePercentResult] {
  782. const fmtQueryCPUModePct = `
  783. sum(rate(node_cpu_seconds_total{%s}[%s])) by (%s, mode) / ignoring(mode)
  784. group_left sum(rate(node_cpu_seconds_total{%s}[%s])) by (%s)
  785. `
  786. // env.GetPromClusterFilter(), windowStr, env.GetPromClusterLabel(), env.GetPromClusterFilter(), windowStr, fmtOffset, env.GetPromClusterLabel()
  787. cfg := pds.promConfig
  788. durStr := timeutil.DurationString(end.Sub(start))
  789. if durStr == "" {
  790. panic("failed to parse duration string passed to QueryNodeCPUModePercent")
  791. }
  792. queryCPUModePct := fmt.Sprintf(fmtQueryCPUModePct, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  793. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  794. return source.NewFuture(source.DecodeNodeCPUModePercentResult, ctx.QueryAtTime(queryCPUModePct, end))
  795. }
  796. func (pds *PrometheusDataSource) QueryNodeRAMSystemPercent(start, end time.Time) *source.Future[source.NodeRAMSystemPercentResult] {
  797. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  798. const nodeRAMSystemPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  799. cfg := pds.promConfig
  800. minsPerResolution := cfg.DataResolutionMinutes
  801. durStr := timeutil.DurationString(end.Sub(start))
  802. if durStr == "" {
  803. panic("failed to parse duration string passed to QueryNodeRAMSystemPercent")
  804. }
  805. queryRAMSystemPct := fmt.Sprintf(nodeRAMSystemPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  806. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  807. return source.NewFuture(source.DecodeNodeRAMSystemPercentResult, ctx.QueryAtTime(queryRAMSystemPct, end))
  808. }
  809. func (pds *PrometheusDataSource) QueryNodeRAMUserPercent(start, end time.Time) *source.Future[source.NodeRAMUserPercentResult] {
  810. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  811. const nodeRAMUserPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  812. cfg := pds.promConfig
  813. minsPerResolution := cfg.DataResolutionMinutes
  814. durStr := timeutil.DurationString(end.Sub(start))
  815. if durStr == "" {
  816. panic("failed to parse duration string passed to QueryNodeRAMUserPercent")
  817. }
  818. queryRAMUserPct := fmt.Sprintf(nodeRAMUserPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  819. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  820. return source.NewFuture(source.DecodeNodeRAMUserPercentResult, ctx.QueryAtTime(queryRAMUserPct, end))
  821. }
  822. func (pds *PrometheusDataSource) QueryLBPricePerHr(start, end time.Time) *source.Future[source.LBPricePerHrResult] {
  823. const queryFmtLBCostPerHr = `avg(avg_over_time(kubecost_load_balancer_cost{%s}[%s])) by (namespace, service_name, ingress_ip, %s)`
  824. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  825. cfg := pds.promConfig
  826. durStr := timeutil.DurationString(end.Sub(start))
  827. if durStr == "" {
  828. panic("failed to parse duration string passed to QueryLBPricePerHr")
  829. }
  830. queryLBCostPerHr := fmt.Sprintf(queryFmtLBCostPerHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  831. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  832. return source.NewFuture(source.DecodeLBPricePerHrResult, ctx.QueryAtTime(queryLBCostPerHr, end))
  833. }
  834. func (pds *PrometheusDataSource) QueryLBActiveMinutes(start, end time.Time) *source.Future[source.LBActiveMinutesResult] {
  835. const lbActiveMinutesQuery = `avg(kubecost_load_balancer_cost{%s}) by (namespace, service_name, %s, ingress_ip)[%s:%dm]`
  836. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  837. cfg := pds.promConfig
  838. minsPerResolution := cfg.DataResolutionMinutes
  839. durStr := timeutil.DurationString(end.Sub(start))
  840. if durStr == "" {
  841. panic("failed to parse duration string passed to QueryLBActiveMinutes")
  842. }
  843. queryLBActiveMins := fmt.Sprintf(lbActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  844. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  845. return source.NewFuture(source.DecodeLBActiveMinutesResult, ctx.QueryAtTime(queryLBActiveMins, end))
  846. }
  847. func (pds *PrometheusDataSource) QueryClusterManagementDuration(start, end time.Time) *source.Future[source.ClusterManagementDurationResult] {
  848. const clusterManagementDurationQuery = `avg(kubecost_cluster_management_cost{%s}) by (%s, provisioner_name)[%s:%dm]`
  849. cfg := pds.promConfig
  850. minsPerResolution := cfg.DataResolutionMinutes
  851. durStr := timeutil.DurationString(end.Sub(start))
  852. if durStr == "" {
  853. panic("failed to parse duration string passed to QueryClusterManagementDuration")
  854. }
  855. queryClusterManagementDuration := fmt.Sprintf(clusterManagementDurationQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  856. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  857. return source.NewFuture(source.DecodeClusterManagementDurationResult, ctx.QueryAtTime(queryClusterManagementDuration, end))
  858. }
  859. func (pds *PrometheusDataSource) QueryClusterManagementPricePerHr(start, end time.Time) *source.Future[source.ClusterManagementPricePerHrResult] {
  860. const clusterManagementCostQuery = `avg(avg_over_time(kubecost_cluster_management_cost{%s}[%s])) by (%s, provisioner_name)`
  861. // env.GetPromClusterFilter(), durationStr, env.GetPromClusterLabel()
  862. cfg := pds.promConfig
  863. durStr := timeutil.DurationString(end.Sub(start))
  864. if durStr == "" {
  865. panic("failed to parse duration string passed to QueryClusterManagementCost")
  866. }
  867. queryClusterManagementCost := fmt.Sprintf(clusterManagementCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  868. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  869. return source.NewFuture(source.DecodeClusterManagementPricePerHrResult, ctx.QueryAtTime(queryClusterManagementCost, end))
  870. }
  871. func (pds *PrometheusDataSource) QueryDataCount(start, end time.Time) *source.Future[source.DataCountResult] {
  872. const fmtQueryDataCount = `
  873. count_over_time(sum(kube_node_status_capacity_cpu_cores{%s}) by (%s)[%s:%dm]) * %d
  874. `
  875. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, minsPerResolution)
  876. cfg := pds.promConfig
  877. minsPerResolution := cfg.DataResolutionMinutes
  878. durStr := timeutil.DurationString(end.Sub(start))
  879. if durStr == "" {
  880. panic("failed to parse duration string passed to QueryDataCount")
  881. }
  882. queryDataCount := fmt.Sprintf(fmtQueryDataCount, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, minsPerResolution)
  883. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  884. return source.NewFuture(source.DecodeDataCountResult, ctx.QueryAtTime(queryDataCount, end))
  885. }
  886. func (pds *PrometheusDataSource) QueryTotalGPU(start, end time.Time) *source.Future[source.TotalGPUResult] {
  887. const fmtQueryTotalGPU = `
  888. sum(
  889. sum_over_time(node_gpu_hourly_cost{%s}[%s:%dm]) * %f
  890. ) by (%s)
  891. `
  892. // env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, hourlyToCumulative, env.GetPromClusterLabel())
  893. cfg := pds.promConfig
  894. minsPerResolution := cfg.DataResolutionMinutes
  895. durStr := timeutil.DurationString(end.Sub(start))
  896. if durStr == "" {
  897. panic("failed to parse duration string passed to QueryTotalGPU")
  898. }
  899. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  900. // value, converts it to a cumulative value; i.e.
  901. // [$/hr] * [min/res]*[hr/min] = [$/res]
  902. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  903. queryTotalGPU := fmt.Sprintf(fmtQueryTotalGPU, cfg.ClusterFilter, durStr, minsPerResolution, hourlyToCumulative, cfg.ClusterLabel)
  904. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  905. return source.NewFuture(source.DecodeTotalGPUResult, ctx.QueryAtTime(queryTotalGPU, end))
  906. }
  907. func (pds *PrometheusDataSource) QueryTotalCPU(start, end time.Time) *source.Future[source.TotalCPUResult] {
  908. const fmtQueryTotalCPU = `
  909. sum(
  910. sum_over_time(avg(kube_node_status_capacity_cpu_cores{%s}) by (node, %s)[%s:%dm]) *
  911. avg(avg_over_time(node_cpu_hourly_cost{%s}[%s:%dm])) by (node, %s) * %f
  912. ) by (%s)
  913. `
  914. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel()
  915. cfg := pds.promConfig
  916. minsPerResolution := cfg.DataResolutionMinutes
  917. durStr := timeutil.DurationString(end.Sub(start))
  918. if durStr == "" {
  919. panic("failed to parse duration string passed to QueryTotalCPU")
  920. }
  921. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  922. // value, converts it to a cumulative value; i.e.
  923. // [$/hr] * [min/res]*[hr/min] = [$/res]
  924. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  925. queryTotalCPU := fmt.Sprintf(fmtQueryTotalCPU, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, hourlyToCumulative, cfg.ClusterLabel)
  926. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  927. return source.NewFuture(source.DecodeTotalCPUResult, ctx.QueryAtTime(queryTotalCPU, end))
  928. }
  929. func (pds *PrometheusDataSource) QueryTotalRAM(start, end time.Time) *source.Future[source.TotalRAMResult] {
  930. const fmtQueryTotalRAM = `
  931. sum(
  932. sum_over_time(avg(kube_node_status_capacity_memory_bytes{%s}) by (node, %s)[%s:%dm]) / 1024 / 1024 / 1024 *
  933. avg(avg_over_time(node_ram_hourly_cost{%s}[%s:%dm])) by (node, %s) * %f
  934. ) by (%s)
  935. `
  936. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, env.GetPromClusterFilter(), windowStr, minsPerResolution, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
  937. cfg := pds.promConfig
  938. minsPerResolution := cfg.DataResolutionMinutes
  939. durStr := timeutil.DurationString(end.Sub(start))
  940. if durStr == "" {
  941. panic("failed to parse duration string passed to QueryTotalRAM")
  942. }
  943. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  944. // value, converts it to a cumulative value; i.e.
  945. // [$/hr] * [min/res]*[hr/min] = [$/res]
  946. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  947. queryTotalRAM := fmt.Sprintf(fmtQueryTotalRAM, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, hourlyToCumulative, cfg.ClusterLabel)
  948. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  949. return source.NewFuture(source.DecodeTotalRAMResult, ctx.QueryAtTime(queryTotalRAM, end))
  950. }
  951. func (pds *PrometheusDataSource) QueryTotalStorage(start, end time.Time) *source.Future[source.TotalStorageResult] {
  952. const fmtQueryTotalStorage = `
  953. sum(
  954. sum_over_time(avg(kube_persistentvolume_capacity_bytes{%s}) by (persistentvolume, %s)[%s:%dm]) / 1024 / 1024 / 1024 *
  955. avg(avg_over_time(pv_hourly_cost{%s}[%s:%dm])) by (persistentvolume, %s) * %f
  956. ) by (%s)
  957. `
  958. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, env.GetPromClusterFilter(), windowStr, minsPerResolution, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
  959. cfg := pds.promConfig
  960. minsPerResolution := cfg.DataResolutionMinutes
  961. durStr := timeutil.DurationString(end.Sub(start))
  962. if durStr == "" {
  963. panic("failed to parse duration string passed to QueryTotalStorage")
  964. }
  965. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  966. // value, converts it to a cumulative value; i.e.
  967. // [$/hr] * [min/res]*[hr/min] = [$/res]
  968. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  969. queryTotalStorage := fmt.Sprintf(fmtQueryTotalStorage, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, hourlyToCumulative, cfg.ClusterLabel)
  970. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  971. return source.NewFuture(source.DecodeTotalStorageResult, ctx.QueryAtTime(queryTotalStorage, end))
  972. }
  973. func (pds *PrometheusDataSource) QueryClusterCores(start, end time.Time, step time.Duration) *source.Future[source.ClusterCoresResult] {
  974. const queryClusterCores = `sum(
  975. avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (node, %s) * avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s) * 730 +
  976. avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (node, %s) * 730
  977. ) by (%s)`
  978. // env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  979. cfg := pds.promConfig
  980. durStr := timeutil.DurationString(step)
  981. if durStr == "" {
  982. panic("failed to parse duration string passed to QueryClusterCores")
  983. }
  984. clusterCoresQuery := fmt.Sprintf(queryClusterCores, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  985. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  986. return source.NewFuture(source.DecodeClusterCoresResult, ctx.QueryRange(clusterCoresQuery, start, end, step))
  987. }
  988. func (pds *PrometheusDataSource) QueryClusterRAM(start, end time.Time, step time.Duration) *source.Future[source.ClusterRAMResult] {
  989. const queryClusterRAM = `sum(
  990. avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (node, %s) / 1024 / 1024 / 1024 * avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s) * 730
  991. ) by (%s)`
  992. // env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  993. cfg := pds.promConfig
  994. durStr := timeutil.DurationString(step)
  995. if durStr == "" {
  996. panic("failed to parse duration string passed to QueryClusterRAM")
  997. }
  998. clusterRAMQuery := fmt.Sprintf(queryClusterRAM, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  999. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1000. return source.NewFuture(source.DecodeClusterRAMResult, ctx.QueryRange(clusterRAMQuery, start, end, step))
  1001. }
  1002. func (pds *PrometheusDataSource) QueryClusterStorage(start, end time.Time, step time.Duration) *source.Future[source.ClusterStorageResult] {
  1003. return pds.QueryClusterStorageByProvider("", start, end, step)
  1004. }
  1005. func (pds *PrometheusDataSource) QueryClusterStorageByProvider(provider string, start, end time.Time, step time.Duration) *source.Future[source.ClusterStorageResult] {
  1006. const queryStorage = `sum(
  1007. avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (persistentvolume, %s) * 730
  1008. * avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (persistentvolume, %s) / 1024 / 1024 / 1024
  1009. ) by (%s) %s`
  1010. // env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterLabel(), localStorageQuery)
  1011. var localStorageQuery string
  1012. if provider != "" {
  1013. key := strings.ToLower(provider)
  1014. if f, ok := providerStorageQueries[key]; ok {
  1015. localStorageQuery = f(pds.promConfig, start, end, true, false)
  1016. } else {
  1017. localStorageQuery = ""
  1018. }
  1019. }
  1020. if localStorageQuery != "" {
  1021. localStorageQuery = fmt.Sprintf(" + %s", localStorageQuery)
  1022. }
  1023. cfg := pds.promConfig
  1024. durStr := timeutil.DurationString(step)
  1025. if durStr == "" {
  1026. panic("failed to parse duration string passed to QueryClusterCores")
  1027. }
  1028. clusterStorageQuery := fmt.Sprintf(queryStorage, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel, localStorageQuery)
  1029. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1030. return source.NewFuture(source.DecodeClusterStorageResult, ctx.QueryRange(clusterStorageQuery, start, end, step))
  1031. }
  1032. func (pds *PrometheusDataSource) QueryClusterTotal(start, end time.Time, step time.Duration) *source.Future[source.ClusterTotalResult] {
  1033. return pds.QueryClusterTotalByProvider("", start, end, step)
  1034. }
  1035. func (pds *PrometheusDataSource) QueryClusterTotalByProvider(provider string, start, end time.Time, step time.Duration) *source.Future[source.ClusterTotalResult] {
  1036. const queryTotal = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 +
  1037. sum(
  1038. avg(avg_over_time(pv_hourly_cost{%s}[1h])) by (persistentvolume, %s) * 730
  1039. * avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[1h])) by (persistentvolume, %s) / 1024 / 1024 / 1024
  1040. ) by (%s) %s`
  1041. var localStorageQuery string
  1042. if provider != "" {
  1043. key := strings.ToLower(provider)
  1044. if f, ok := providerStorageQueries[key]; ok {
  1045. localStorageQuery = f(pds.promConfig, start, end, true, false)
  1046. } else {
  1047. localStorageQuery = ""
  1048. }
  1049. }
  1050. if localStorageQuery != "" {
  1051. localStorageQuery = fmt.Sprintf(" + %s", localStorageQuery)
  1052. }
  1053. cfg := pds.promConfig
  1054. durStr := timeutil.DurationString(step)
  1055. if durStr == "" {
  1056. panic("failed to parse duration string passed to QueryClusterTotalByProvider")
  1057. }
  1058. clusterTotalQuery := fmt.Sprintf(queryTotal, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterLabel, localStorageQuery)
  1059. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1060. return source.NewFuture(source.DecodeClusterTotalResult, ctx.QueryRange(clusterTotalQuery, start, end, step))
  1061. }
  1062. func (pds *PrometheusDataSource) QueryClusterNodes(start, end time.Time, step time.Duration) *source.Future[source.ClusterNodesResult] {
  1063. return pds.QueryClusterNodesByProvider("", start, end, step)
  1064. }
  1065. func (pds *PrometheusDataSource) QueryClusterNodesByProvider(provider string, start, end time.Time, step time.Duration) *source.Future[source.ClusterNodesResult] {
  1066. const queryNodes = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 %s`
  1067. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), localStorageQuery)
  1068. var localStorageQuery string
  1069. if provider != "" {
  1070. key := strings.ToLower(provider)
  1071. if f, ok := providerStorageQueries[key]; ok {
  1072. localStorageQuery = f(pds.promConfig, start, end, true, false)
  1073. } else {
  1074. localStorageQuery = ""
  1075. }
  1076. }
  1077. if localStorageQuery != "" {
  1078. localStorageQuery = fmt.Sprintf(" + %s", localStorageQuery)
  1079. }
  1080. cfg := pds.promConfig
  1081. durStr := timeutil.DurationString(step)
  1082. if durStr == "" {
  1083. panic("failed to parse duration string passed to QueryClusterNodesByProvider")
  1084. }
  1085. clusterNodesCostQuery := fmt.Sprintf(queryNodes, cfg.ClusterFilter, cfg.ClusterLabel, localStorageQuery)
  1086. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1087. return source.NewFuture(source.DecodeClusterNodesResult, ctx.QueryRange(clusterNodesCostQuery, start, end, step))
  1088. }
  1089. // AllocationMetricQuerier
  1090. func (pds *PrometheusDataSource) QueryPods(start, end time.Time) *source.Future[source.PodsResult] {
  1091. const queryFmtPods = `avg(kube_pod_container_status_running{%s} != 0) by (pod, namespace, %s)[%s:%s]`
  1092. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  1093. cfg := pds.promConfig
  1094. resolution := cfg.DataResolution
  1095. resStr := timeutil.DurationString(resolution)
  1096. durStr := timeutil.DurationString(end.Sub(start))
  1097. if durStr == "" {
  1098. panic("failed to parse duration string passed to QueryPods")
  1099. }
  1100. queryPods := fmt.Sprintf(queryFmtPods, cfg.ClusterFilter, cfg.ClusterLabel, durStr, resStr)
  1101. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1102. return source.NewFuture(source.DecodePodsResult, ctx.QueryAtTime(queryPods, end))
  1103. }
  1104. func (pds *PrometheusDataSource) QueryPodsUID(start, end time.Time) *source.Future[source.PodsResult] {
  1105. const queryFmtPodsUID = `avg(kube_pod_container_status_running{%s} != 0) by (pod, namespace, uid, %s)[%s:%s]`
  1106. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, resStr)
  1107. cfg := pds.promConfig
  1108. resolution := cfg.DataResolution
  1109. resStr := timeutil.DurationString(resolution)
  1110. durStr := timeutil.DurationString(end.Sub(start))
  1111. if durStr == "" {
  1112. panic("failed to parse duration string passed to QueryPodsUID")
  1113. }
  1114. queryPodsUID := fmt.Sprintf(queryFmtPodsUID, cfg.ClusterFilter, cfg.ClusterLabel, durStr, resStr)
  1115. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1116. return source.NewFuture(source.DecodePodsResult, ctx.QueryAtTime(queryPodsUID, end))
  1117. }
  1118. func (pds *PrometheusDataSource) QueryRAMBytesAllocated(start, end time.Time) *source.Future[source.RAMBytesAllocatedResult] {
  1119. const queryFmtRAMBytesAllocated = `avg(avg_over_time(container_memory_allocation_bytes{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s, provider_id)`
  1120. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1121. cfg := pds.promConfig
  1122. durStr := timeutil.DurationString(end.Sub(start))
  1123. if durStr == "" {
  1124. panic("failed to parse duration string passed to QueryRAMBytesAllocated")
  1125. }
  1126. queryRAMBytesAllocated := fmt.Sprintf(queryFmtRAMBytesAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1127. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1128. return source.NewFuture(source.DecodeRAMBytesAllocatedResult, ctx.QueryAtTime(queryRAMBytesAllocated, end))
  1129. }
  1130. func (pds *PrometheusDataSource) QueryRAMRequests(start, end time.Time) *source.Future[source.RAMRequestsResult] {
  1131. const queryFmtRAMRequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  1132. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1133. cfg := pds.promConfig
  1134. durStr := timeutil.DurationString(end.Sub(start))
  1135. if durStr == "" {
  1136. panic("failed to parse duration string passed to QueryRAMRequests")
  1137. }
  1138. queryRAMRequests := fmt.Sprintf(queryFmtRAMRequests, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1139. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1140. return source.NewFuture(source.DecodeRAMRequestsResult, ctx.QueryAtTime(queryRAMRequests, end))
  1141. }
  1142. func (pds *PrometheusDataSource) QueryRAMUsageAvg(start, end time.Time) *source.Future[source.RAMUsageAvgResult] {
  1143. const queryFmtRAMUsageAvg = `avg(avg_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
  1144. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1145. cfg := pds.promConfig
  1146. durStr := timeutil.DurationString(end.Sub(start))
  1147. if durStr == "" {
  1148. panic("failed to parse duration string passed to QueryRAMUsageAvg")
  1149. }
  1150. queryRAMUsageAvg := fmt.Sprintf(queryFmtRAMUsageAvg, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1151. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1152. return source.NewFuture(source.DecodeRAMUsageAvgResult, ctx.QueryAtTime(queryRAMUsageAvg, end))
  1153. }
  1154. func (pds *PrometheusDataSource) QueryRAMUsageMax(start, end time.Time) *source.Future[source.RAMUsageMaxResult] {
  1155. const queryFmtRAMUsageMax = `max(max_over_time(container_memory_working_set_bytes{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
  1156. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1157. cfg := pds.promConfig
  1158. durStr := timeutil.DurationString(end.Sub(start))
  1159. if durStr == "" {
  1160. panic("failed to parse duration string passed to QueryRAMUsageMax")
  1161. }
  1162. queryRAMUsageMax := fmt.Sprintf(queryFmtRAMUsageMax, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1163. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1164. return source.NewFuture(source.DecodeRAMUsageMaxResult, ctx.QueryAtTime(queryRAMUsageMax, end))
  1165. }
  1166. func (pds *PrometheusDataSource) QueryCPUCoresAllocated(start, end time.Time) *source.Future[source.CPUCoresAllocatedResult] {
  1167. const queryFmtCPUCoresAllocated = `avg(avg_over_time(container_cpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  1168. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1169. cfg := pds.promConfig
  1170. durStr := timeutil.DurationString(end.Sub(start))
  1171. if durStr == "" {
  1172. panic("failed to parse duration string passed to QueryCPUCoresAllocated")
  1173. }
  1174. queryCPUCoresAllocated := fmt.Sprintf(queryFmtCPUCoresAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1175. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1176. return source.NewFuture(source.DecodeCPUCoresAllocatedResult, ctx.QueryAtTime(queryCPUCoresAllocated, end))
  1177. }
  1178. func (pds *PrometheusDataSource) QueryCPURequests(start, end time.Time) *source.Future[source.CPURequestsResult] {
  1179. const queryFmtCPURequests = `avg(avg_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  1180. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1181. cfg := pds.promConfig
  1182. durStr := timeutil.DurationString(end.Sub(start))
  1183. if durStr == "" {
  1184. panic("failed to parse duration string passed to QueryCPURequests")
  1185. }
  1186. queryCPURequests := fmt.Sprintf(queryFmtCPURequests, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1187. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1188. return source.NewFuture(source.DecodeCPURequestsResult, ctx.QueryAtTime(queryCPURequests, end))
  1189. }
  1190. func (pds *PrometheusDataSource) QueryCPUUsageAvg(start, end time.Time) *source.Future[source.CPUUsageAvgResult] {
  1191. const queryFmtCPUUsageAvg = `avg(rate(container_cpu_usage_seconds_total{container!="", container_name!="POD", container!="POD", %s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
  1192. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1193. cfg := pds.promConfig
  1194. durStr := timeutil.DurationString(end.Sub(start))
  1195. if durStr == "" {
  1196. panic("failed to parse duration string passed to QueryCPUUsageAvg")
  1197. }
  1198. queryCPUUsageAvg := fmt.Sprintf(queryFmtCPUUsageAvg, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1199. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1200. return source.NewFuture(source.DecodeCPUUsageAvgResult, ctx.QueryAtTime(queryCPUUsageAvg, end))
  1201. }
  1202. func (pds *PrometheusDataSource) QueryCPUUsageMax(start, end time.Time) *source.Future[source.CPUUsageMaxResult] {
  1203. // Because we use container_cpu_usage_seconds_total to calculate CPU usage
  1204. // at any given "instant" of time, we need to use an irate or rate. To then
  1205. // calculate a max (or any aggregation) we have to perform an aggregation
  1206. // query on top of an instant-by-instant maximum. Prometheus supports this
  1207. // type of query with a "subquery" [1], however it is reportedly expensive
  1208. // to make such a query. By default, Kubecost's Prometheus config includes
  1209. // a recording rule that keeps track of the instant-by-instant irate for CPU
  1210. // usage. The metric in this query is created by that recording rule.
  1211. //
  1212. // [1] https://prometheus.io/blog/2019/01/28/subquery-support/
  1213. //
  1214. // If changing the name of the recording rule, make sure to update the
  1215. // corresponding diagnostic query to avoid confusion.
  1216. const queryFmtCPUUsageMaxRecordingRule = `max(max_over_time(kubecost_container_cpu_usage_irate{%s}[%s])) by (container_name, container, pod_name, pod, namespace, instance, %s)`
  1217. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1218. // This is the subquery equivalent of the above recording rule query. It is
  1219. // more expensive, but does not require the recording rule. It should be
  1220. // used as a fallback query if the recording rule data does not exist.
  1221. //
  1222. // The parameter after the colon [:<thisone>] in the subquery affects the
  1223. // resolution of the subquery.
  1224. // The parameter after the metric ...{}[<thisone>] should be set to 2x
  1225. // the resolution, to make sure the irate always has two points to query
  1226. // in case the Prom scrape duration has been reduced to be equal to the
  1227. // ETL resolution.
  1228. const queryFmtCPUUsageMaxSubquery = `max(max_over_time(irate(container_cpu_usage_seconds_total{container!="POD", container!="", %s}[%s])[%s:%s])) by (container, pod_name, pod, namespace, instance, %s)`
  1229. // env.GetPromClusterFilter(), doubleResStr, durStr, resStr, env.GetPromClusterLabel()
  1230. cfg := pds.promConfig
  1231. durStr := timeutil.DurationString(end.Sub(start))
  1232. if durStr == "" {
  1233. panic("failed to parse duration string passed to QueryCPUUsageMax")
  1234. }
  1235. queryCPUUsageMaxRecordingRule := fmt.Sprintf(queryFmtCPUUsageMaxRecordingRule, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1236. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1237. resCPUUsageMaxRR := ctx.QueryAtTime(queryCPUUsageMaxRecordingRule, end)
  1238. resCPUUsageMax, _ := resCPUUsageMaxRR.Await()
  1239. if len(resCPUUsageMax) > 0 {
  1240. return wrapResults(queryCPUUsageMaxRecordingRule, source.DecodeCPUUsageMaxResult, resCPUUsageMax)
  1241. }
  1242. resolution := cfg.DataResolution
  1243. resStr := timeutil.DurationString(resolution)
  1244. doubleResStr := timeutil.DurationString(2 * resolution)
  1245. queryCPUUsageMaxSubquery := fmt.Sprintf(queryFmtCPUUsageMaxSubquery, cfg.ClusterFilter, doubleResStr, durStr, resStr, cfg.ClusterLabel)
  1246. return source.NewFuture(source.DecodeCPUUsageMaxResult, ctx.QueryAtTime(queryCPUUsageMaxSubquery, end))
  1247. }
  1248. func (pds *PrometheusDataSource) QueryGPUsRequested(start, end time.Time) *source.Future[source.GPUsRequestedResult] {
  1249. const queryFmtGPUsRequested = `avg(avg_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  1250. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1251. cfg := pds.promConfig
  1252. durStr := timeutil.DurationString(end.Sub(start))
  1253. if durStr == "" {
  1254. panic("failed to parse duration string passed to QueryGPUsRequested")
  1255. }
  1256. queryGPUsRequested := fmt.Sprintf(queryFmtGPUsRequested, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1257. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1258. return source.NewFuture(source.DecodeGPUsRequestedResult, ctx.QueryAtTime(queryGPUsRequested, end))
  1259. }
  1260. func (pds *PrometheusDataSource) QueryGPUsUsageAvg(start, end time.Time) *source.Future[source.GPUsUsageAvgResult] {
  1261. const queryFmtGPUsUsageAvg = `avg(avg_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
  1262. // durStr, env.GetPromClusterLabel()
  1263. cfg := pds.promConfig
  1264. durStr := timeutil.DurationString(end.Sub(start))
  1265. if durStr == "" {
  1266. panic("failed to parse duration string passed to QueryGPUsUsageAvg")
  1267. }
  1268. queryGPUsUsageAvg := fmt.Sprintf(queryFmtGPUsUsageAvg, durStr, cfg.ClusterLabel)
  1269. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1270. return source.NewFuture(source.DecodeGPUsUsageAvgResult, ctx.QueryAtTime(queryGPUsUsageAvg, end))
  1271. }
  1272. func (pds *PrometheusDataSource) QueryGPUsUsageMax(start, end time.Time) *source.Future[source.GPUsUsageMaxResult] {
  1273. const queryFmtGPUsUsageMax = `max(max_over_time(DCGM_FI_PROF_GR_ENGINE_ACTIVE{container!=""}[%s])) by (container, pod, namespace, %s)`
  1274. // durStr, env.GetPromClusterLabel()
  1275. cfg := pds.promConfig
  1276. durStr := timeutil.DurationString(end.Sub(start))
  1277. if durStr == "" {
  1278. panic("failed to parse duration string passed to QueryGPUsUsageMax")
  1279. }
  1280. queryGPUsUsageMax := fmt.Sprintf(queryFmtGPUsUsageMax, durStr, cfg.ClusterLabel)
  1281. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1282. return source.NewFuture(source.DecodeGPUsUsageMaxResult, ctx.QueryAtTime(queryGPUsUsageMax, end))
  1283. }
  1284. func (pds *PrometheusDataSource) QueryGPUsAllocated(start, end time.Time) *source.Future[source.GPUsAllocatedResult] {
  1285. const queryFmtGPUsAllocated = `avg(avg_over_time(container_gpu_allocation{container!="", container!="POD", node!="", %s}[%s])) by (container, pod, namespace, node, %s)`
  1286. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1287. cfg := pds.promConfig
  1288. durStr := timeutil.DurationString(end.Sub(start))
  1289. if durStr == "" {
  1290. panic("failed to parse duration string passed to QueryGPUsAllocated")
  1291. }
  1292. queryGPUsAllocated := fmt.Sprintf(queryFmtGPUsAllocated, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1293. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1294. return source.NewFuture(source.DecodeGPUsAllocatedResult, ctx.QueryAtTime(queryGPUsAllocated, end))
  1295. }
  1296. func (pds *PrometheusDataSource) QueryIsGPUShared(start, end time.Time) *source.Future[source.IsGPUSharedResult] {
  1297. const queryFmtIsGPUShared = `avg(avg_over_time(kube_pod_container_resource_requests{container!="", node != "", pod != "", container!= "", unit = "integer", %s}[%s])) by (container, pod, namespace, node, resource)`
  1298. // env.GetPromClusterFilter(), durStr
  1299. cfg := pds.promConfig
  1300. durStr := timeutil.DurationString(end.Sub(start))
  1301. if durStr == "" {
  1302. panic("failed to parse duration string passed to QueryIsGPUShared")
  1303. }
  1304. queryIsGPUShared := fmt.Sprintf(queryFmtIsGPUShared, cfg.ClusterFilter, durStr)
  1305. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1306. return source.NewFuture(source.DecodeIsGPUSharedResult, ctx.QueryAtTime(queryIsGPUShared, end))
  1307. }
  1308. func (pds *PrometheusDataSource) QueryGPUInfo(start, end time.Time) *source.Future[source.GPUInfoResult] {
  1309. const queryFmtGetGPUInfo = `avg(avg_over_time(DCGM_FI_DEV_DEC_UTIL{container!="",%s}[%s])) by (container, pod, namespace, device, modelName, UUID)`
  1310. // env.GetPromClusterFilter(), durStr
  1311. cfg := pds.promConfig
  1312. durStr := timeutil.DurationString(end.Sub(start))
  1313. if durStr == "" {
  1314. panic("failed to parse duration string passed to QueryGPUInfo")
  1315. }
  1316. queryGetGPUInfo := fmt.Sprintf(queryFmtGetGPUInfo, cfg.ClusterFilter, durStr)
  1317. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1318. return source.NewFuture(source.DecodeGPUInfoResult, ctx.QueryAtTime(queryGetGPUInfo, end))
  1319. }
  1320. func (pds *PrometheusDataSource) QueryNodeCPUPricePerHr(start, end time.Time) *source.Future[source.NodeCPUPricePerHrResult] {
  1321. const queryFmtNodeCostPerCPUHr = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  1322. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1323. cfg := pds.promConfig
  1324. durStr := timeutil.DurationString(end.Sub(start))
  1325. if durStr == "" {
  1326. panic("failed to parse duration string passed to QueryNodeCPUPricePerHr")
  1327. }
  1328. queryNodeCostPerCPUHr := fmt.Sprintf(queryFmtNodeCostPerCPUHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1329. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1330. return source.NewFuture(source.DecodeNodeCPUPricePerHrResult, ctx.QueryAtTime(queryNodeCostPerCPUHr, end))
  1331. }
  1332. func (pds *PrometheusDataSource) QueryNodeRAMPricePerGiBHr(start, end time.Time) *source.Future[source.NodeRAMPricePerGiBHrResult] {
  1333. const queryFmtNodeCostPerRAMGiBHr = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  1334. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1335. cfg := pds.promConfig
  1336. durStr := timeutil.DurationString(end.Sub(start))
  1337. if durStr == "" {
  1338. panic("failed to parse duration string passed to QueryNodeRAMPricePerGiBHr")
  1339. }
  1340. queryNodeCostPerRAMGiBHr := fmt.Sprintf(queryFmtNodeCostPerRAMGiBHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1341. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1342. return source.NewFuture(source.DecodeNodeRAMPricePerGiBHrResult, ctx.QueryAtTime(queryNodeCostPerRAMGiBHr, end))
  1343. }
  1344. func (pds *PrometheusDataSource) QueryNodeGPUPricePerHr(start, end time.Time) *source.Future[source.NodeGPUPricePerHrResult] {
  1345. const queryFmtNodeCostPerGPUHr = `avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (node, %s, instance_type, provider_id)`
  1346. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1347. cfg := pds.promConfig
  1348. durStr := timeutil.DurationString(end.Sub(start))
  1349. if durStr == "" {
  1350. panic("failed to parse duration string passed to QueryNodeGPUPricePerHr")
  1351. }
  1352. queryNodeCostPerGPUHr := fmt.Sprintf(queryFmtNodeCostPerGPUHr, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1353. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1354. return source.NewFuture(source.DecodeNodeGPUPricePerHrResult, ctx.QueryAtTime(queryNodeCostPerGPUHr, end))
  1355. }
  1356. func (pds *PrometheusDataSource) QueryNodeIsSpot(start, end time.Time) *source.Future[source.NodeIsSpotResult] {
  1357. const queryFmtNodeIsSpot = `avg_over_time(kubecost_node_is_spot{%s}[%s])`
  1358. //`avg_over_time(kubecost_node_is_spot{%s}[%s:%dm])`
  1359. // env.GetPromClusterFilter(), durStr)
  1360. cfg := pds.promConfig
  1361. durStr := timeutil.DurationString(end.Sub(start))
  1362. if durStr == "" {
  1363. panic("failed to parse duration string passed to QueryNodeIsSpot2")
  1364. }
  1365. queryNodeIsSpot := fmt.Sprintf(queryFmtNodeIsSpot, cfg.ClusterFilter, durStr)
  1366. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1367. return source.NewFuture(source.DecodeNodeIsSpotResult, ctx.QueryAtTime(queryNodeIsSpot, end))
  1368. }
  1369. func (pds *PrometheusDataSource) QueryPodPVCAllocation(start, end time.Time) *source.Future[source.PodPVCAllocationResult] {
  1370. const queryFmtPodPVCAllocation = `avg(avg_over_time(pod_pvc_allocation{%s}[%s])) by (persistentvolume, persistentvolumeclaim, pod, namespace, %s)`
  1371. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1372. cfg := pds.promConfig
  1373. durStr := timeutil.DurationString(end.Sub(start))
  1374. if durStr == "" {
  1375. panic("failed to parse duration string passed to QueryPodPVCAllocation")
  1376. }
  1377. queryPodPVCAllocation := fmt.Sprintf(queryFmtPodPVCAllocation, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1378. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1379. return source.NewFuture(source.DecodePodPVCAllocationResult, ctx.QueryAtTime(queryPodPVCAllocation, end))
  1380. }
  1381. func (pds *PrometheusDataSource) QueryPVCBytesRequested(start, end time.Time) *source.Future[source.PVCBytesRequestedResult] {
  1382. const queryFmtPVCBytesRequested = `avg(avg_over_time(kube_persistentvolumeclaim_resource_requests_storage_bytes{%s}[%s])) by (persistentvolumeclaim, namespace, %s)`
  1383. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1384. cfg := pds.promConfig
  1385. durStr := timeutil.DurationString(end.Sub(start))
  1386. if durStr == "" {
  1387. panic("failed to parse duration string passed to QueryPVCBytesRequested")
  1388. }
  1389. queryPVCBytesRequested := fmt.Sprintf(queryFmtPVCBytesRequested, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1390. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1391. return source.NewFuture(source.DecodePVCBytesRequestedResult, ctx.QueryAtTime(queryPVCBytesRequested, end))
  1392. }
  1393. func (pds *PrometheusDataSource) QueryPVBytes(start, end time.Time) *source.Future[source.PVBytesResult] {
  1394. const queryFmtPVBytes = `avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (persistentvolume, %s)`
  1395. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1396. cfg := pds.promConfig
  1397. durStr := timeutil.DurationString(end.Sub(start))
  1398. if durStr == "" {
  1399. panic("failed to parse duration string passed to QueryPVBytes")
  1400. }
  1401. queryPVBytes := fmt.Sprintf(queryFmtPVBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1402. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1403. return source.NewFuture(source.DecodePVBytesResult, ctx.QueryAtTime(queryPVBytes, end))
  1404. }
  1405. func (pds *PrometheusDataSource) QueryPVCostPerGiBHour(start, end time.Time) *source.Future[source.PVPricePerGiBHourResult] {
  1406. const queryFmtPVCostPerGiBHour = `avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (volumename, %s)`
  1407. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1408. cfg := pds.promConfig
  1409. durStr := timeutil.DurationString(end.Sub(start))
  1410. if durStr == "" {
  1411. panic("failed to parse duration string passed to QueryPVCostPerGiBHour")
  1412. }
  1413. queryPVCostPerGiBHour := fmt.Sprintf(queryFmtPVCostPerGiBHour, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1414. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1415. return source.NewFuture(source.DecodePVPricePerGiBHourResult, ctx.QueryAtTime(queryPVCostPerGiBHour, end))
  1416. }
  1417. func (pds *PrometheusDataSource) QueryPVInfo(start, end time.Time) *source.Future[source.PVInfoResult] {
  1418. const queryFmtPVMeta = `avg(avg_over_time(kubecost_pv_info{%s}[%s])) by (%s, storageclass, persistentvolume, provider_id)`
  1419. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1420. cfg := pds.promConfig
  1421. durStr := timeutil.DurationString(end.Sub(start))
  1422. if durStr == "" {
  1423. panic("failed to parse duration string passed to QueryPVMeta")
  1424. }
  1425. queryPVMeta := fmt.Sprintf(queryFmtPVMeta, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1426. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1427. return source.NewFuture(source.DecodePVInfoResult, ctx.QueryAtTime(queryPVMeta, end))
  1428. }
  1429. func (pds *PrometheusDataSource) QueryNetZoneGiB(start, end time.Time) *source.Future[source.NetZoneGiBResult] {
  1430. const queryFmtNetZoneGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", same_zone="false", same_region="true", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  1431. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1432. cfg := pds.promConfig
  1433. durStr := timeutil.DurationString(end.Sub(start))
  1434. if durStr == "" {
  1435. panic("failed to parse duration string passed to QueryNetZoneGiB")
  1436. }
  1437. queryNetZoneGiB := fmt.Sprintf(queryFmtNetZoneGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1438. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1439. return source.NewFuture(source.DecodeNetZoneGiBResult, ctx.QueryAtTime(queryNetZoneGiB, end))
  1440. }
  1441. func (pds *PrometheusDataSource) QueryNetZonePricePerGiB(start, end time.Time) *source.Future[source.NetZonePricePerGiBResult] {
  1442. const queryFmtNetZoneCostPerGiB = `avg(avg_over_time(kubecost_network_zone_egress_cost{%s}[%s])) by (%s)`
  1443. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1444. cfg := pds.promConfig
  1445. durStr := timeutil.DurationString(end.Sub(start))
  1446. if durStr == "" {
  1447. panic("failed to parse duration string passed to QueryNetZonePricePerGiB")
  1448. }
  1449. queryNetZoneCostPerGiB := fmt.Sprintf(queryFmtNetZoneCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1450. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1451. return source.NewFuture(source.DecodeNetZonePricePerGiBResult, ctx.QueryAtTime(queryNetZoneCostPerGiB, end))
  1452. }
  1453. func (pds *PrometheusDataSource) QueryNetRegionGiB(start, end time.Time) *source.Future[source.NetRegionGiBResult] {
  1454. const queryFmtNetRegionGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", same_zone="false", same_region="false", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  1455. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1456. cfg := pds.promConfig
  1457. durStr := timeutil.DurationString(end.Sub(start))
  1458. if durStr == "" {
  1459. panic("failed to parse duration string passed to QueryNetRegionGiB")
  1460. }
  1461. queryNetRegionGiB := fmt.Sprintf(queryFmtNetRegionGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1462. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1463. return source.NewFuture(source.DecodeNetRegionGiBResult, ctx.QueryAtTime(queryNetRegionGiB, end))
  1464. }
  1465. func (pds *PrometheusDataSource) QueryNetRegionPricePerGiB(start, end time.Time) *source.Future[source.NetRegionPricePerGiBResult] {
  1466. const queryFmtNetRegionCostPerGiB = `avg(avg_over_time(kubecost_network_region_egress_cost{%s}[%s])) by (%s)`
  1467. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1468. cfg := pds.promConfig
  1469. durStr := timeutil.DurationString(end.Sub(start))
  1470. if durStr == "" {
  1471. panic("failed to parse duration string passed to QueryNetRegionPricePerGiB")
  1472. }
  1473. queryNetRegionCostPerGiB := fmt.Sprintf(queryFmtNetRegionCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1474. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1475. return source.NewFuture(source.DecodeNetRegionPricePerGiBResult, ctx.QueryAtTime(queryNetRegionCostPerGiB, end))
  1476. }
  1477. func (pds *PrometheusDataSource) QueryNetInternetGiB(start, end time.Time) *source.Future[source.NetInternetGiBResult] {
  1478. const queryFmtNetInternetGiB = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s])) by (pod_name, namespace, %s) / 1024 / 1024 / 1024`
  1479. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1480. cfg := pds.promConfig
  1481. durStr := timeutil.DurationString(end.Sub(start))
  1482. if durStr == "" {
  1483. panic("failed to parse duration string passed to QueryNetInternetGiB")
  1484. }
  1485. queryNetInternetGiB := fmt.Sprintf(queryFmtNetInternetGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1486. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1487. return source.NewFuture(source.DecodeNetInternetGiBResult, ctx.QueryAtTime(queryNetInternetGiB, end))
  1488. }
  1489. func (pds *PrometheusDataSource) QueryNetInternetPricePerGiB(start, end time.Time) *source.Future[source.NetInternetPricePerGiBResult] {
  1490. const queryFmtNetInternetCostPerGiB = `avg(avg_over_time(kubecost_network_internet_egress_cost{%s}[%s])) by (%s)`
  1491. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1492. cfg := pds.promConfig
  1493. durStr := timeutil.DurationString(end.Sub(start))
  1494. if durStr == "" {
  1495. panic("failed to parse duration string passed to QueryNetInternetPricePerGiB")
  1496. }
  1497. queryNetInternetCostPerGiB := fmt.Sprintf(queryFmtNetInternetCostPerGiB, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1498. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1499. return source.NewFuture(source.DecodeNetInternetPricePerGiBResult, ctx.QueryAtTime(queryNetInternetCostPerGiB, end))
  1500. }
  1501. func (pds *PrometheusDataSource) QueryNetReceiveBytes(start, end time.Time) *source.Future[source.NetReceiveBytesResult] {
  1502. const queryFmtNetReceiveBytes = `sum(increase(container_network_receive_bytes_total{pod!="", %s}[%s])) by (pod_name, pod, namespace, %s)`
  1503. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1504. cfg := pds.promConfig
  1505. durStr := timeutil.DurationString(end.Sub(start))
  1506. if durStr == "" {
  1507. panic("failed to parse duration string passed to QueryNetReceiveBytes")
  1508. }
  1509. queryNetReceiveBytes := fmt.Sprintf(queryFmtNetReceiveBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1510. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1511. return source.NewFuture(source.DecodeNetReceiveBytesResult, ctx.QueryAtTime(queryNetReceiveBytes, end))
  1512. }
  1513. func (pds *PrometheusDataSource) QueryNetTransferBytes(start, end time.Time) *source.Future[source.NetTransferBytesResult] {
  1514. const queryFmtNetTransferBytes = `sum(increase(container_network_transmit_bytes_total{pod!="", %s}[%s])) by (pod_name, pod, namespace, %s)`
  1515. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1516. cfg := pds.promConfig
  1517. durStr := timeutil.DurationString(end.Sub(start))
  1518. if durStr == "" {
  1519. panic("failed to parse duration string passed to QueryNetTransferBytes")
  1520. }
  1521. queryNetTransferBytes := fmt.Sprintf(queryFmtNetTransferBytes, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1522. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1523. return source.NewFuture(source.DecodeNetTransferBytesResult, ctx.QueryAtTime(queryNetTransferBytes, end))
  1524. }
  1525. func (pds *PrometheusDataSource) QueryNamespaceLabels(start, end time.Time) *source.Future[source.NamespaceLabelsResult] {
  1526. const queryFmtNamespaceLabels = `avg_over_time(kube_namespace_labels{%s}[%s])`
  1527. // env.GetPromClusterFilter(), durStr
  1528. cfg := pds.promConfig
  1529. durStr := timeutil.DurationString(end.Sub(start))
  1530. if durStr == "" {
  1531. panic("failed to parse duration string passed to QueryNamespaceLabels")
  1532. }
  1533. queryNamespaceLabels := fmt.Sprintf(queryFmtNamespaceLabels, cfg.ClusterFilter, durStr)
  1534. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1535. return source.NewFuture(source.DecodeNamespaceLabelsResult, ctx.QueryAtTime(queryNamespaceLabels, end))
  1536. }
  1537. func (pds *PrometheusDataSource) QueryNamespaceAnnotations(start, end time.Time) *source.Future[source.NamespaceAnnotationsResult] {
  1538. const queryFmtNamespaceAnnotations = `avg_over_time(kube_namespace_annotations{%s}[%s])`
  1539. // env.GetPromClusterFilter(), durStr
  1540. cfg := pds.promConfig
  1541. durStr := timeutil.DurationString(end.Sub(start))
  1542. if durStr == "" {
  1543. panic("failed to parse duration string passed to QueryNamespaceAnnotations")
  1544. }
  1545. queryNamespaceAnnotations := fmt.Sprintf(queryFmtNamespaceAnnotations, cfg.ClusterFilter, durStr)
  1546. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1547. return source.NewFuture(source.DecodeNamespaceAnnotationsResult, ctx.QueryAtTime(queryNamespaceAnnotations, end))
  1548. }
  1549. func (pds *PrometheusDataSource) QueryPodLabels(start, end time.Time) *source.Future[source.PodLabelsResult] {
  1550. const queryFmtPodLabels = `avg_over_time(kube_pod_labels{%s}[%s])`
  1551. // env.GetPromClusterFilter(), durStr
  1552. cfg := pds.promConfig
  1553. durStr := timeutil.DurationString(end.Sub(start))
  1554. if durStr == "" {
  1555. panic("failed to parse duration string passed to QueryPodLabels")
  1556. }
  1557. queryPodLabels := fmt.Sprintf(queryFmtPodLabels, cfg.ClusterFilter, durStr)
  1558. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1559. return source.NewFuture(source.DecodePodLabelsResult, ctx.QueryAtTime(queryPodLabels, end))
  1560. }
  1561. func (pds *PrometheusDataSource) QueryPodAnnotations(start, end time.Time) *source.Future[source.PodAnnotationsResult] {
  1562. const queryFmtPodAnnotations = `avg_over_time(kube_pod_annotations{%s}[%s])`
  1563. // env.GetPromClusterFilter(), durStr
  1564. cfg := pds.promConfig
  1565. durStr := timeutil.DurationString(end.Sub(start))
  1566. if durStr == "" {
  1567. panic("failed to parse duration string passed to QueryPodAnnotations")
  1568. }
  1569. queryPodAnnotations := fmt.Sprintf(queryFmtPodAnnotations, cfg.ClusterFilter, durStr)
  1570. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1571. return source.NewFuture(source.DecodePodAnnotationsResult, ctx.QueryAtTime(queryPodAnnotations, end))
  1572. }
  1573. func (pds *PrometheusDataSource) QueryServiceLabels(start, end time.Time) *source.Future[source.ServiceLabelsResult] {
  1574. const queryFmtServiceLabels = `avg_over_time(service_selector_labels{%s}[%s])`
  1575. // env.GetPromClusterFilter(), durStr
  1576. cfg := pds.promConfig
  1577. durStr := timeutil.DurationString(end.Sub(start))
  1578. if durStr == "" {
  1579. panic("failed to parse duration string passed to QueryServiceLabels")
  1580. }
  1581. queryServiceLabels := fmt.Sprintf(queryFmtServiceLabels, cfg.ClusterFilter, durStr)
  1582. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1583. return source.NewFuture(source.DecodeServiceLabelsResult, ctx.QueryAtTime(queryServiceLabels, end))
  1584. }
  1585. func (pds *PrometheusDataSource) QueryDeploymentLabels(start, end time.Time) *source.Future[source.DeploymentLabelsResult] {
  1586. const queryFmtDeploymentLabels = `avg_over_time(deployment_match_labels{%s}[%s])`
  1587. // env.GetPromClusterFilter(), durStr
  1588. cfg := pds.promConfig
  1589. durStr := timeutil.DurationString(end.Sub(start))
  1590. if durStr == "" {
  1591. panic("failed to parse duration string passed to QueryNamespaceAnnotations")
  1592. }
  1593. queryDeploymentLabels := fmt.Sprintf(queryFmtDeploymentLabels, cfg.ClusterFilter, durStr)
  1594. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1595. return source.NewFuture(source.DecodeDeploymentLabelsResult, ctx.QueryAtTime(queryDeploymentLabels, end))
  1596. }
  1597. func (pds *PrometheusDataSource) QueryStatefulSetLabels(start, end time.Time) *source.Future[source.StatefulSetLabelsResult] {
  1598. const queryFmtStatefulSetLabels = `avg_over_time(statefulSet_match_labels{%s}[%s])`
  1599. // env.GetPromClusterFilter(), durStr
  1600. cfg := pds.promConfig
  1601. durStr := timeutil.DurationString(end.Sub(start))
  1602. if durStr == "" {
  1603. panic("failed to parse duration string passed to QueryStatefulSetLabels")
  1604. }
  1605. queryStatefulSetLabels := fmt.Sprintf(queryFmtStatefulSetLabels, cfg.ClusterFilter, durStr)
  1606. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1607. return source.NewFuture(source.DecodeStatefulSetLabelsResult, ctx.QueryAtTime(queryStatefulSetLabels, end))
  1608. }
  1609. func (pds *PrometheusDataSource) QueryDaemonSetLabels(start, end time.Time) *source.Future[source.DaemonSetLabelsResult] {
  1610. const queryFmtDaemonSetLabels = `sum(avg_over_time(kube_pod_owner{owner_kind="DaemonSet", %s}[%s])) by (pod, owner_name, namespace, %s)`
  1611. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1612. cfg := pds.promConfig
  1613. durStr := timeutil.DurationString(end.Sub(start))
  1614. if durStr == "" {
  1615. panic("failed to parse duration string passed to QueryDaemonSetLabels")
  1616. }
  1617. queryDaemonSetLabels := fmt.Sprintf(queryFmtDaemonSetLabels, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1618. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1619. return source.NewFuture(source.DecodeDaemonSetLabelsResult, ctx.QueryAtTime(queryDaemonSetLabels, end))
  1620. }
  1621. func (pds *PrometheusDataSource) QueryJobLabels(start, end time.Time) *source.Future[source.JobLabelsResult] {
  1622. const queryFmtJobLabels = `sum(avg_over_time(kube_pod_owner{owner_kind="Job", %s}[%s])) by (pod, owner_name, namespace ,%s)`
  1623. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1624. cfg := pds.promConfig
  1625. durStr := timeutil.DurationString(end.Sub(start))
  1626. if durStr == "" {
  1627. panic("failed to parse duration string passed to QueryJobLabels")
  1628. }
  1629. queryJobLabels := fmt.Sprintf(queryFmtJobLabels, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1630. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1631. return source.NewFuture(source.DecodeJobLabelsResult, ctx.QueryAtTime(queryJobLabels, end))
  1632. }
  1633. func (pds *PrometheusDataSource) QueryPodsWithReplicaSetOwner(start, end time.Time) *source.Future[source.PodsWithReplicaSetOwnerResult] {
  1634. const queryFmtPodsWithReplicaSetOwner = `sum(avg_over_time(kube_pod_owner{owner_kind="ReplicaSet", %s}[%s])) by (pod, owner_name, namespace ,%s)`
  1635. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1636. cfg := pds.promConfig
  1637. durStr := timeutil.DurationString(end.Sub(start))
  1638. if durStr == "" {
  1639. panic("failed to parse duration string passed to QueryPodsWithReplicaSetOwner")
  1640. }
  1641. queryPodsWithReplicaSetOwner := fmt.Sprintf(queryFmtPodsWithReplicaSetOwner, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1642. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1643. return source.NewFuture(source.DecodePodsWithReplicaSetOwnerResult, ctx.QueryAtTime(queryPodsWithReplicaSetOwner, end))
  1644. }
  1645. func (pds *PrometheusDataSource) QueryReplicaSetsWithoutOwners(start, end time.Time) *source.Future[source.ReplicaSetsWithoutOwnersResult] {
  1646. const queryFmtReplicaSetsWithoutOwners = `avg(avg_over_time(kube_replicaset_owner{owner_kind="<none>", owner_name="<none>", %s}[%s])) by (replicaset, namespace, %s)`
  1647. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1648. cfg := pds.promConfig
  1649. durStr := timeutil.DurationString(end.Sub(start))
  1650. if durStr == "" {
  1651. panic("failed to parse duration string passed to QueryReplicaSetsWithoutOwners")
  1652. }
  1653. queryReplicaSetsWithoutOwners := fmt.Sprintf(queryFmtReplicaSetsWithoutOwners, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1654. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1655. return source.NewFuture(source.DecodeReplicaSetsWithoutOwnersResult, ctx.QueryAtTime(queryReplicaSetsWithoutOwners, end))
  1656. }
  1657. func (pds *PrometheusDataSource) QueryReplicaSetsWithRollout(start, end time.Time) *source.Future[source.ReplicaSetsWithRolloutResult] {
  1658. const queryFmtReplicaSetsWithRolloutOwner = `avg(avg_over_time(kube_replicaset_owner{owner_kind="Rollout", %s}[%s])) by (replicaset, namespace, owner_kind, owner_name, %s)`
  1659. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1660. cfg := pds.promConfig
  1661. durStr := timeutil.DurationString(end.Sub(start))
  1662. if durStr == "" {
  1663. panic("failed to parse duration string passed to QueryReplicaSetsWithRollout")
  1664. }
  1665. queryReplicaSetsWithRolloutOwner := fmt.Sprintf(queryFmtReplicaSetsWithRolloutOwner, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1666. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1667. return source.NewFuture(source.DecodeReplicaSetsWithRolloutResult, ctx.QueryAtTime(queryReplicaSetsWithRolloutOwner, end))
  1668. }
  1669. func (pds *PrometheusDataSource) QueryDataCoverage(limitDays int) (time.Time, time.Time, error) {
  1670. const (
  1671. queryFmtOldestSample = `min_over_time(timestamp(group(node_cpu_hourly_cost{%s}))[%s:%s])`
  1672. queryFmtNewestSample = `max_over_time(timestamp(group(node_cpu_hourly_cost{%s}))[%s:%s])`
  1673. )
  1674. cfg := pds.promConfig
  1675. now := time.Now()
  1676. durStr := fmt.Sprintf("%dd", limitDays)
  1677. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  1678. queryOldest := fmt.Sprintf(queryFmtOldestSample, cfg.ClusterFilter, durStr, "1h")
  1679. resOldestFut := ctx.QueryAtTime(queryOldest, now)
  1680. resOldest, err := resOldestFut.Await()
  1681. if err != nil {
  1682. return time.Time{}, time.Time{}, fmt.Errorf("querying oldest sample: %w", err)
  1683. }
  1684. if len(resOldest) == 0 || len(resOldest[0].Values) == 0 {
  1685. return time.Time{}, time.Time{}, fmt.Errorf("querying oldest sample: %w", err)
  1686. }
  1687. oldest := time.Unix(int64(resOldest[0].Values[0].Value), 0)
  1688. queryNewest := fmt.Sprintf(queryFmtNewestSample, cfg.ClusterFilter, durStr, "1h")
  1689. resNewestFut := ctx.QueryAtTime(queryNewest, now)
  1690. resNewest, err := resNewestFut.Await()
  1691. if err != nil {
  1692. return time.Time{}, time.Time{}, fmt.Errorf("querying newest sample: %w", err)
  1693. }
  1694. if len(resNewest) == 0 || len(resNewest[0].Values) == 0 {
  1695. return time.Time{}, time.Time{}, fmt.Errorf("querying newest sample: %w", err)
  1696. }
  1697. newest := time.Unix(int64(resNewest[0].Values[0].Value), 0)
  1698. return oldest, newest, nil
  1699. }
  1700. func newEmptyResult[T any](decoder source.ResultDecoder[T]) *source.Future[T] {
  1701. ch := make(source.QueryResultsChan)
  1702. go func() {
  1703. results := source.NewQueryResults("")
  1704. ch <- results
  1705. }()
  1706. return source.NewFuture(decoder, ch)
  1707. }
  1708. func wrapResults[T any](query string, decoder source.ResultDecoder[T], results []*source.QueryResult) *source.Future[T] {
  1709. ch := make(source.QueryResultsChan)
  1710. go func() {
  1711. r := source.NewQueryResults(query)
  1712. r.Results = results
  1713. ch <- r
  1714. }()
  1715. return source.NewFuture(decoder, ch)
  1716. }