datasource.go 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992
  1. package prom
  2. import (
  3. "context"
  4. "fmt"
  5. "math"
  6. "net/http"
  7. "strconv"
  8. "strings"
  9. "time"
  10. "github.com/julienschmidt/httprouter"
  11. "github.com/opencost/opencost/modules/prometheus-source/pkg/env"
  12. "github.com/opencost/opencost/pkg/prom"
  13. "github.com/opencost/opencost/core/pkg/log"
  14. "github.com/opencost/opencost/core/pkg/protocol"
  15. "github.com/opencost/opencost/core/pkg/source"
  16. "github.com/opencost/opencost/core/pkg/util/httputil"
  17. "github.com/opencost/opencost/core/pkg/util/json"
  18. "github.com/opencost/opencost/core/pkg/util/timeutil"
  19. prometheus "github.com/prometheus/client_golang/api"
  20. prometheusAPI "github.com/prometheus/client_golang/api/prometheus/v1"
  21. )
  22. const (
  23. apiPrefix = "/api/v1"
  24. epAlertManagers = apiPrefix + "/alertmanagers"
  25. epLabelValues = apiPrefix + "/label/:name/values"
  26. epSeries = apiPrefix + "/series"
  27. epTargets = apiPrefix + "/targets"
  28. epSnapshot = apiPrefix + "/admin/tsdb/snapshot"
  29. epDeleteSeries = apiPrefix + "/admin/tsdb/delete_series"
  30. epCleanTombstones = apiPrefix + "/admin/tsdb/clean_tombstones"
  31. epConfig = apiPrefix + "/status/config"
  32. epFlags = apiPrefix + "/status/flags"
  33. epRules = apiPrefix + "/rules"
  34. )
  35. // helper for query range proxy requests
  36. func toStartEndStep(qp httputil.QueryParams) (start, end time.Time, step time.Duration, err error) {
  37. var e error
  38. ss := qp.Get("start", "")
  39. es := qp.Get("end", "")
  40. ds := qp.Get("duration", "")
  41. layout := "2006-01-02T15:04:05.000Z"
  42. start, e = time.Parse(layout, ss)
  43. if e != nil {
  44. err = fmt.Errorf("Error parsing time %s. Error: %s", ss, err)
  45. return
  46. }
  47. end, e = time.Parse(layout, es)
  48. if e != nil {
  49. err = fmt.Errorf("Error parsing time %s. Error: %s", es, err)
  50. return
  51. }
  52. step, e = time.ParseDuration(ds)
  53. if e != nil {
  54. err = fmt.Errorf("Error parsing duration %s. Error: %s", ds, err)
  55. return
  56. }
  57. err = nil
  58. return
  59. }
  60. // FIXME: Before merge, implement a more robust design. This is brittle and bug-prone,
  61. // FIXME: but decouples the prom requirements from the Provider implementations.
  62. var providerStorageQueries = map[string]func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string{
  63. "aws": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  64. return ""
  65. },
  66. "gcp": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  67. // TODO Set to the price for the appropriate storage class. It's not trivial to determine the local storage disk type
  68. // See https://cloud.google.com/compute/disks-image-pricing#persistentdisk
  69. localStorageCost := 0.04
  70. baseMetric := "container_fs_limit_bytes"
  71. if used {
  72. baseMetric = "container_fs_usage_bytes"
  73. }
  74. fmtCumulativeQuery := `sum(
  75. sum_over_time(%s{device!="tmpfs", id="/", %s}[%s:1m])
  76. ) by (%s) / 60 / 730 / 1024 / 1024 / 1024 * %f`
  77. fmtMonthlyQuery := `sum(
  78. avg_over_time(%s{device!="tmpfs", id="/", %s}[%s:1m])
  79. ) by (%s) / 1024 / 1024 / 1024 * %f`
  80. fmtQuery := fmtCumulativeQuery
  81. if rate {
  82. fmtQuery = fmtMonthlyQuery
  83. }
  84. fmtWindow := timeutil.DurationString(end.Sub(start))
  85. return fmt.Sprintf(fmtQuery, baseMetric, config.ClusterFilter, fmtWindow, config.ClusterLabel, localStorageCost)
  86. },
  87. "azure": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  88. return ""
  89. },
  90. "alibaba": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  91. return ""
  92. },
  93. "scaleway": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  94. return ""
  95. },
  96. "otc": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  97. return ""
  98. },
  99. "oracle": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  100. return ""
  101. },
  102. "csv": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  103. return ""
  104. },
  105. "custom": func(config *OpenCostPrometheusConfig, start, end time.Time, rate bool, used bool) string {
  106. return ""
  107. },
  108. }
  109. // creates a new help error which indicates the caller can retry and is non-fatal.
  110. func newHelpRetryError(format string, args ...any) error {
  111. formatWithHelp := format + "\nTroubleshooting help available at: %s"
  112. args = append(args, PrometheusTroubleshootingURL)
  113. cause := fmt.Errorf(formatWithHelp, args...)
  114. return source.NewHelpRetryError(cause)
  115. }
  116. // PrometheusDataSource is the OpenCost data source implementation leveraging Prometheus. Prometheus provides longer retention periods and
  117. // more detailed metrics than the OpenCost Collector, which is useful for historical analysis and cost forecasting.
  118. type PrometheusDataSource struct {
  119. promConfig *OpenCostPrometheusConfig
  120. promClient prometheus.Client
  121. promContexts *ContextFactory
  122. thanosConfig *OpenCostThanosConfig
  123. thanosClient prometheus.Client
  124. thanosContexts *ContextFactory
  125. }
  126. // NewDefaultPrometheusDataSource creates and initializes a new `PrometheusDataSource` with configuration
  127. // parsed from environment variables. This function will block until a connection to prometheus is established,
  128. // or fails. It is recommended to run this function in a goroutine on a retry cycle.
  129. func NewDefaultPrometheusDataSource() (*PrometheusDataSource, error) {
  130. config, err := NewOpenCostPrometheusConfigFromEnv()
  131. if err != nil {
  132. return nil, fmt.Errorf("failed to create prometheus config from env: %w", err)
  133. }
  134. var thanosConfig *OpenCostThanosConfig
  135. if env.IsThanosEnabled() {
  136. // thanos initialization is not fatal, so we log the error and continue
  137. thanosConfig, err = NewOpenCostThanosConfigFromEnv()
  138. if err != nil {
  139. log.Warnf("Thanos was enabled, but failed to create thanos config from env: %s. Continuing...", err.Error())
  140. }
  141. }
  142. return NewPrometheusDataSource(config, thanosConfig)
  143. }
  144. // NewPrometheusDataSource initializes clients for Prometheus and Thanos, and returns a new PrometheusDataSource.
  145. func NewPrometheusDataSource(promConfig *OpenCostPrometheusConfig, thanosConfig *OpenCostThanosConfig) (*PrometheusDataSource, error) {
  146. promClient, err := NewPrometheusClient(promConfig.ServerEndpoint, promConfig.ClientConfig)
  147. if err != nil {
  148. return nil, fmt.Errorf("failed to build prometheus client: %w", err)
  149. }
  150. // validation of the prometheus client
  151. m, err := Validate(promClient, promConfig)
  152. if err != nil || !m.Running {
  153. if err != nil {
  154. return nil, newHelpRetryError("failed to query prometheus at %s: %w", promConfig.ServerEndpoint, err)
  155. } else if !m.Running {
  156. return nil, newHelpRetryError("prometheus at %s is not running", promConfig.ServerEndpoint)
  157. }
  158. } else {
  159. log.Infof("Success: retrieved the 'up' query against prometheus at: %s", promConfig.ServerEndpoint)
  160. }
  161. // we don't consider this a fatal error, but we log for visibility
  162. api := prometheusAPI.NewAPI(promClient)
  163. _, err = api.Buildinfo(context.Background())
  164. if err != nil {
  165. log.Infof("No valid prometheus config file at %s. Error: %s.\nTroubleshooting help available at: %s.\n**Ignore if using cortex/mimir/thanos here**", promConfig.ServerEndpoint, err.Error(), PrometheusTroubleshootingURL)
  166. } else {
  167. log.Infof("Retrieved a prometheus config file from: %s", promConfig.ServerEndpoint)
  168. }
  169. // Fix scrape interval if zero by attempting to lookup the interval for the configured job
  170. if promConfig.ScrapeInterval == 0 {
  171. promConfig.ScrapeInterval = time.Minute
  172. // Lookup scrape interval for kubecost job, update if found
  173. si, err := ScrapeIntervalFor(promClient, promConfig.JobName)
  174. if err == nil {
  175. promConfig.ScrapeInterval = si
  176. }
  177. }
  178. log.Infof("Using scrape interval of %f", promConfig.ScrapeInterval.Seconds())
  179. promContexts := NewContextFactory(promClient, promConfig)
  180. var thanosClient prometheus.Client
  181. var thanosContexts *ContextFactory
  182. // if the thanos configuration is non-nil, we assume intent to use thanos. However, failure to
  183. // initialize the thanos client is not fatal, and we will log the error and continue.
  184. if thanosConfig != nil {
  185. thanosHost := thanosConfig.ServerEndpoint
  186. if thanosHost != "" {
  187. thanosCli, _ := NewThanosClient(thanosHost, thanosConfig)
  188. _, err = Validate(thanosCli, thanosConfig.OpenCostPrometheusConfig)
  189. if err != nil {
  190. log.Warnf("Failed to query Thanos at %s. Error: %s.", thanosHost, err.Error())
  191. thanosClient = thanosCli
  192. } else {
  193. log.Infof("Success: retrieved the 'up' query against Thanos at: %s", thanosHost)
  194. thanosClient = thanosCli
  195. }
  196. thanosContexts = NewContextFactory(thanosClient, thanosContexts.config)
  197. } else {
  198. log.Infof("Error resolving environment variable: $%s", env.ThanosQueryUrlEnvVar)
  199. }
  200. }
  201. return &PrometheusDataSource{
  202. promConfig: promConfig,
  203. promClient: promClient,
  204. promContexts: promContexts,
  205. thanosConfig: thanosConfig,
  206. thanosClient: thanosClient,
  207. thanosContexts: thanosContexts,
  208. }, nil
  209. }
  210. var proto = protocol.HTTP()
  211. // prometheusMetadata returns the metadata for the prometheus server
  212. func (pds *PrometheusDataSource) prometheusMetadata(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  213. w.Header().Set("Content-Type", "application/json")
  214. w.Header().Set("Access-Control-Allow-Origin", "*")
  215. resp := proto.ToResponse(Validate(pds.promClient, pds.promConfig))
  216. proto.WriteResponse(w, resp)
  217. }
  218. // prometheusRecordingRules is a proxy for /rules against prometheus
  219. func (pds *PrometheusDataSource) prometheusRecordingRules(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  220. w.Header().Set("Content-Type", "application/json")
  221. w.Header().Set("Access-Control-Allow-Origin", "*")
  222. u := pds.promClient.URL(epRules, nil)
  223. req, err := http.NewRequest(http.MethodGet, u.String(), nil)
  224. if err != nil {
  225. fmt.Fprintf(w, "Error creating Prometheus rule request: "+err.Error())
  226. }
  227. _, body, err := pds.promClient.Do(r.Context(), req)
  228. if err != nil {
  229. fmt.Fprintf(w, "Error making Prometheus rule request: "+err.Error())
  230. } else {
  231. w.Write(body)
  232. }
  233. }
  234. // prometheusConfig returns the current configuration of the prometheus server
  235. func (pds *PrometheusDataSource) prometheusConfig(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  236. w.Header().Set("Content-Type", "application/json")
  237. w.Header().Set("Access-Control-Allow-Origin", "*")
  238. pConfig := map[string]string{
  239. "address": pds.promConfig.ServerEndpoint,
  240. }
  241. body, err := json.Marshal(pConfig)
  242. if err != nil {
  243. fmt.Fprintf(w, "Error marshalling prometheus config")
  244. } else {
  245. w.Write(body)
  246. }
  247. }
  248. // prometheusTargets is a proxy for /targets against prometheus
  249. func (pds *PrometheusDataSource) prometheusTargets(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  250. w.Header().Set("Content-Type", "application/json")
  251. w.Header().Set("Access-Control-Allow-Origin", "*")
  252. u := pds.promClient.URL(epTargets, nil)
  253. req, err := http.NewRequest(http.MethodGet, u.String(), nil)
  254. if err != nil {
  255. fmt.Fprintf(w, "Error creating Prometheus rule request: "+err.Error())
  256. }
  257. _, body, err := pds.promClient.Do(r.Context(), req)
  258. if err != nil {
  259. fmt.Fprintf(w, "Error making Prometheus rule request: "+err.Error())
  260. } else {
  261. w.Write(body)
  262. }
  263. }
  264. // status returns the status of the prometheus client
  265. func (pds *PrometheusDataSource) status(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  266. w.Header().Set("Content-Type", "application/json")
  267. w.Header().Set("Access-Control-Allow-Origin", "*")
  268. promServer := pds.promConfig.ServerEndpoint
  269. api := prometheusAPI.NewAPI(pds.promClient)
  270. result, err := api.Buildinfo(r.Context())
  271. if err != nil {
  272. fmt.Fprintf(w, "Using Prometheus at "+promServer+". Error: "+err.Error())
  273. } else {
  274. fmt.Fprintf(w, "Using Prometheus at "+promServer+". Version: "+result.Version)
  275. }
  276. }
  277. // prometheusQuery is a proxy for /query against prometheus
  278. func (pds *PrometheusDataSource) prometheusQuery(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  279. w.Header().Set("Content-Type", "application/json")
  280. w.Header().Set("Access-Control-Allow-Origin", "*")
  281. qp := httputil.NewQueryParams(r.URL.Query())
  282. query := qp.Get("query", "")
  283. if query == "" {
  284. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Query Parameter 'query' is unset'")))
  285. return
  286. }
  287. // Attempt to parse time as either a unix timestamp or as an RFC3339 value
  288. var timeVal time.Time
  289. timeStr := qp.Get("time", "")
  290. if len(timeStr) > 0 {
  291. if t, err := strconv.ParseInt(timeStr, 10, 64); err == nil {
  292. timeVal = time.Unix(t, 0)
  293. } else if t, err := time.Parse(time.RFC3339, timeStr); err == nil {
  294. timeVal = t
  295. }
  296. // If time is given, but not parse-able, return an error
  297. if timeVal.IsZero() {
  298. http.Error(w, fmt.Sprintf("time must be a unix timestamp or RFC3339 value; illegal value given: %s", timeStr), http.StatusBadRequest)
  299. }
  300. }
  301. ctx := pds.promContexts.NewNamedContext(FrontendContextName)
  302. body, err := ctx.RawQuery(query, timeVal)
  303. if err != nil {
  304. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Error running query %s. Error: %s", query, err)))
  305. return
  306. }
  307. w.Write(body) // prometheusQueryRange is a proxy for /query_range against prometheus
  308. }
  309. func (pds *PrometheusDataSource) prometheusQueryRange(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  310. w.Header().Set("Content-Type", "application/json")
  311. w.Header().Set("Access-Control-Allow-Origin", "*")
  312. qp := httputil.NewQueryParams(r.URL.Query())
  313. query := qp.Get("query", "")
  314. if query == "" {
  315. fmt.Fprintf(w, "Error parsing query from request parameters.")
  316. return
  317. }
  318. start, end, duration, err := toStartEndStep(qp)
  319. if err != nil {
  320. fmt.Fprintf(w, err.Error())
  321. return
  322. }
  323. ctx := pds.promContexts.NewNamedContext(prom.FrontendContextName)
  324. body, err := ctx.RawQueryRange(query, start, end, duration)
  325. if err != nil {
  326. fmt.Fprintf(w, "Error running query %s. Error: %s", query, err)
  327. return
  328. }
  329. w.Write(body)
  330. }
  331. // thanosQuery is a proxy for /query against thanos
  332. func (pds *PrometheusDataSource) thanosQuery(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  333. w.Header().Set("Content-Type", "application/json")
  334. w.Header().Set("Access-Control-Allow-Origin", "*")
  335. if pds.thanosClient == nil {
  336. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("ThanosDisabled")))
  337. return
  338. }
  339. qp := httputil.NewQueryParams(r.URL.Query())
  340. query := qp.Get("query", "")
  341. if query == "" {
  342. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Query Parameter 'query' is unset'")))
  343. return
  344. }
  345. // Attempt to parse time as either a unix timestamp or as an RFC3339 value
  346. var timeVal time.Time
  347. timeStr := qp.Get("time", "")
  348. if len(timeStr) > 0 {
  349. if t, err := strconv.ParseInt(timeStr, 10, 64); err == nil {
  350. timeVal = time.Unix(t, 0)
  351. } else if t, err := time.Parse(time.RFC3339, timeStr); err == nil {
  352. timeVal = t
  353. }
  354. // If time is given, but not parse-able, return an error
  355. if timeVal.IsZero() {
  356. http.Error(w, fmt.Sprintf("time must be a unix timestamp or RFC3339 value; illegal value given: %s", timeStr), http.StatusBadRequest)
  357. }
  358. }
  359. ctx := pds.thanosContexts.NewNamedContext(FrontendContextName)
  360. body, err := ctx.RawQuery(query, timeVal)
  361. if err != nil {
  362. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("Error running query %s. Error: %s", query, err)))
  363. return
  364. }
  365. w.Write(body)
  366. }
  367. // thanosQueryRange is a proxy for /query_range against thanos
  368. func (pds *PrometheusDataSource) thanosQueryRange(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
  369. w.Header().Set("Content-Type", "application/json")
  370. w.Header().Set("Access-Control-Allow-Origin", "*")
  371. if pds.thanosClient == nil {
  372. proto.WriteResponse(w, proto.ToResponse(nil, fmt.Errorf("ThanosDisabled")))
  373. return
  374. }
  375. qp := httputil.NewQueryParams(r.URL.Query())
  376. query := qp.Get("query", "")
  377. if query == "" {
  378. fmt.Fprintf(w, "Error parsing query from request parameters.")
  379. return
  380. }
  381. start, end, duration, err := toStartEndStep(qp)
  382. if err != nil {
  383. fmt.Fprintf(w, err.Error())
  384. return
  385. }
  386. ctx := pds.thanosContexts.NewNamedContext(FrontendContextName)
  387. body, err := ctx.RawQueryRange(query, start, end, duration)
  388. if err != nil {
  389. fmt.Fprintf(w, "Error running query %s. Error: %s", query, err)
  390. return
  391. }
  392. w.Write(body)
  393. }
  394. // promtheusQueueState returns the current state of the prometheus and thanos request queues
  395. func (pds *PrometheusDataSource) prometheusQueueState(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  396. w.Header().Set("Content-Type", "application/json")
  397. w.Header().Set("Access-Control-Allow-Origin", "*")
  398. promQueueState, err := GetPrometheusQueueState(pds.promClient, pds.promConfig)
  399. if err != nil {
  400. proto.WriteResponse(w, proto.ToResponse(nil, err))
  401. return
  402. }
  403. result := map[string]*PrometheusQueueState{
  404. "prometheus": promQueueState,
  405. }
  406. if pds.thanosClient != nil {
  407. thanosQueueState, err := GetPrometheusQueueState(pds.thanosClient, pds.thanosConfig.OpenCostPrometheusConfig)
  408. if err != nil {
  409. log.Warnf("Error getting Thanos queue state: %s", err)
  410. } else {
  411. result["thanos"] = thanosQueueState
  412. }
  413. }
  414. proto.WriteResponse(w, proto.ToResponse(result, nil))
  415. }
  416. // prometheusMetrics retrieves availability of Prometheus and Thanos metrics
  417. func (pds *PrometheusDataSource) prometheusMetrics(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  418. w.Header().Set("Content-Type", "application/json")
  419. w.Header().Set("Access-Control-Allow-Origin", "*")
  420. promMetrics := GetPrometheusMetrics(pds.promClient, pds.promConfig, "")
  421. result := map[string][]*PrometheusDiagnostic{
  422. "prometheus": promMetrics,
  423. }
  424. if pds.thanosClient != nil {
  425. thanosMetrics := GetPrometheusMetrics(pds.thanosClient, pds.thanosConfig.OpenCostPrometheusConfig, pds.thanosConfig.Offset)
  426. result["thanos"] = thanosMetrics
  427. }
  428. proto.WriteResponse(w, proto.ToResponse(result, nil))
  429. }
  430. func (pds *PrometheusDataSource) RegisterEndPoints(router *httprouter.Router) {
  431. // endpoints migrated from server
  432. router.GET("/validatePrometheus", pds.prometheusMetadata)
  433. router.GET("/prometheusRecordingRules", pds.prometheusRecordingRules)
  434. router.GET("/prometheusConfig", pds.prometheusConfig)
  435. router.GET("/prometheusTargets", pds.prometheusTargets)
  436. router.GET("/status", pds.status)
  437. // prom query proxies
  438. router.GET("/prometheusQuery", pds.prometheusQuery)
  439. router.GET("/prometheusQueryRange", pds.prometheusQueryRange)
  440. router.GET("/thanosQuery", pds.thanosQuery)
  441. router.GET("/thanosQueryRange", pds.thanosQueryRange)
  442. // diagnostics
  443. router.GET("/diagnostics/requestQueue", pds.prometheusQueueState)
  444. router.GET("/diagnostics/prometheusMetrics", pds.prometheusMetrics)
  445. }
  446. func (pds *PrometheusDataSource) RefreshInterval() time.Duration {
  447. return pds.promConfig.ScrapeInterval
  448. }
  449. func (pds *PrometheusDataSource) BatchDuration() time.Duration {
  450. return pds.promConfig.MaxQueryDuration
  451. }
  452. func (pds *PrometheusDataSource) QueryRAMUsage(window string, offset string) source.QueryResultsChan {
  453. const ramUsageQuery = `avg(
  454. label_replace(
  455. label_replace(
  456. label_replace(
  457. sum_over_time(container_memory_working_set_bytes{container!="", container!="POD", instance!="", %s}[%s] %s), "node", "$1", "instance", "(.+)"
  458. ), "container_name", "$1", "container", "(.+)"
  459. ), "pod_name", "$1", "pod", "(.+)"
  460. )
  461. ) by (namespace, container_name, pod_name, node, %s)`
  462. // env.GetPromClusterFilter(), window, offset, env.GetPromClusterLabel())
  463. if offset != "" && !strings.Contains(offset, "offset") {
  464. offset = fmt.Sprintf("offset %s", offset)
  465. }
  466. cfg := pds.promConfig
  467. queryRAMUsage := fmt.Sprintf(ramUsageQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  468. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  469. return ctx.Query(queryRAMUsage)
  470. }
  471. func (pds *PrometheusDataSource) QueryCPUUsage(window string, offset string) source.QueryResultsChan {
  472. const cpuUsageQuery = `avg(
  473. label_replace(
  474. label_replace(
  475. label_replace(
  476. rate(
  477. container_cpu_usage_seconds_total{container!="", container!="POD", instance!="", %s}[%s] %s
  478. ), "node", "$1", "instance", "(.+)"
  479. ), "container_name", "$1", "container", "(.+)"
  480. ), "pod_name", "$1", "pod", "(.+)"
  481. )
  482. ) by (namespace, container_name, pod_name, node, %s)`
  483. // env.GetPromClusterFilter(), window, offset, env.GetPromClusterLabel())
  484. if offset != "" && !strings.Contains(offset, "offset") {
  485. offset = fmt.Sprintf("offset %s", offset)
  486. }
  487. cfg := pds.promConfig
  488. queryCPUUsage := fmt.Sprintf(cpuUsageQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  489. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  490. return ctx.Query(queryCPUUsage)
  491. }
  492. func (pds *PrometheusDataSource) QueryNetworkInZoneRequests(window string, offset string) source.QueryResultsChan {
  493. const zoneNetworkUsageQuery = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="true", %s}[%s] %s)) by (namespace,pod_name,%s) / 1024 / 1024 / 1024`
  494. // env.GetPromClusterFilter(), window, "", env.GetPromClusterLabel())
  495. if offset != "" && !strings.Contains(offset, "offset") {
  496. offset = fmt.Sprintf("offset %s", offset)
  497. }
  498. cfg := pds.promConfig
  499. queryZoneNetworkUsage := fmt.Sprintf(zoneNetworkUsageQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  500. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  501. return ctx.Query(queryZoneNetworkUsage)
  502. }
  503. func (pds *PrometheusDataSource) QueryNetworkInRegionRequests(window string, offset string) source.QueryResultsChan {
  504. const regionNetworkUsageQuery = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false", %s}[%s] %s)) by (namespace,pod_name,%s) / 1024 / 1024 / 1024`
  505. // env.GetPromClusterFilter(), window, "", env.GetPromClusterLabel())
  506. if offset != "" && !strings.Contains(offset, "offset") {
  507. offset = fmt.Sprintf("offset %s", offset)
  508. }
  509. cfg := pds.promConfig
  510. queryRegionNetworkUsage := fmt.Sprintf(regionNetworkUsageQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  511. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  512. return ctx.Query(queryRegionNetworkUsage)
  513. }
  514. func (pds *PrometheusDataSource) QueryNetworkInternetRequests(window string, offset string) source.QueryResultsChan {
  515. const internetNetworkUsageQuery = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s] %s)) by (namespace,pod_name,%s) / 1024 / 1024 / 1024`
  516. // env.GetPromClusterFilter(), window, "", env.GetPromClusterLabel())
  517. cfg := pds.promConfig
  518. queryInternetNetworkUsage := fmt.Sprintf(internetNetworkUsageQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  519. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  520. return ctx.Query(queryInternetNetworkUsage)
  521. }
  522. func (pds *PrometheusDataSource) QueryNormalization(window string, offset string) source.QueryResultsChan {
  523. const normalizationQuery = `max(count_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", %s}[%s] %s))`
  524. // env.GetPromClusterFilter(), window, offset)
  525. if offset != "" && !strings.Contains(offset, "offset") {
  526. offset = fmt.Sprintf("offset %s", offset)
  527. }
  528. cfg := pds.promConfig
  529. queryNormalization := fmt.Sprintf(normalizationQuery, cfg.ClusterFilter, window, offset)
  530. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  531. return ctx.Query(queryNormalization)
  532. }
  533. func (pds *PrometheusDataSource) QueryHistoricalCPUCost(window string, offset string) source.QueryResultsChan {
  534. const historicalCPUCostQuery = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s] %s)) by (node, instance, %s)`
  535. // env.GetPromClusterFilter(), window, offsetStr, env.GetPromClusterLabel())
  536. if offset != "" && !strings.Contains(offset, "offset") {
  537. offset = fmt.Sprintf("offset %s", offset)
  538. }
  539. cfg := pds.promConfig
  540. queryHistoricalCPUCost := fmt.Sprintf(historicalCPUCostQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  541. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  542. return ctx.Query(queryHistoricalCPUCost)
  543. }
  544. func (pds *PrometheusDataSource) QueryHistoricalRAMCost(window string, offset string) source.QueryResultsChan {
  545. const historicalRAMCostQuery = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s] %s)) by (node, instance, %s)`
  546. // env.GetPromClusterFilter(), window, offsetStr, env.GetPromClusterLabel())
  547. if offset != "" && !strings.Contains(offset, "offset") {
  548. offset = fmt.Sprintf("offset %s", offset)
  549. }
  550. cfg := pds.promConfig
  551. queryHistoricalRAMCost := fmt.Sprintf(historicalRAMCostQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  552. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  553. return ctx.Query(queryHistoricalRAMCost)
  554. }
  555. func (pds *PrometheusDataSource) QueryHistoricalGPUCost(window string, offset string) source.QueryResultsChan {
  556. const historicalGPUCostQuery = `avg(avg_over_time(node_gpu_hourly_cost{%s}[%s] %s)) by (node, instance, %s)`
  557. // env.GetPromClusterFilter(), window, offsetStr, env.GetPromClusterLabel())
  558. if offset != "" && !strings.Contains(offset, "offset") {
  559. offset = fmt.Sprintf("offset %s", offset)
  560. }
  561. cfg := pds.promConfig
  562. queryHistoricalGPUCost := fmt.Sprintf(historicalGPUCostQuery, cfg.ClusterFilter, window, offset, cfg.ClusterLabel)
  563. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  564. return ctx.Query(queryHistoricalGPUCost)
  565. }
  566. func (pds *PrometheusDataSource) QueryHistoricalPodLabels(window string, offset string) source.QueryResultsChan {
  567. const historicalPodLabelsQuery = `kube_pod_labels{%s}[%s] %s`
  568. // env.GetPromClusterFilter(), window, offset
  569. if offset != "" && !strings.Contains(offset, "offset") {
  570. offset = fmt.Sprintf("offset %s", offset)
  571. }
  572. cfg := pds.promConfig
  573. queryHistoricalPodLabels := fmt.Sprintf(historicalPodLabelsQuery, cfg.ClusterFilter, window, offset)
  574. ctx := pds.promContexts.NewNamedContext(ComputeCostDataContextName)
  575. return ctx.Query(queryHistoricalPodLabels)
  576. }
  577. func (pds *PrometheusDataSource) QueryRAMRequestsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  578. const ramRequestsQuery = `avg(
  579. label_replace(
  580. label_replace(
  581. sum_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", container!="",container!="POD", node!="", %s}[%s] %s)
  582. , "container_name","$1","container","(.+)"
  583. ), "pod_name","$1","pod","(.+)"
  584. )
  585. ) by (namespace,container_name,pod_name,node,%s)`
  586. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  587. cfg := pds.promConfig
  588. resolution = snapResolutionMinute(resolution)
  589. resMins := int64(resolution.Minutes())
  590. resStr := formatResolutionMinutes(resMins)
  591. queryRAMRequests := fmt.Sprintf(ramRequestsQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  592. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  593. return ctx.QueryRange(queryRAMRequests, start, end, resolution)
  594. }
  595. func (pds *PrometheusDataSource) QueryRAMUsageOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  596. const ramUsageQuery = `avg(
  597. label_replace(
  598. label_replace(
  599. label_replace(
  600. sum_over_time(container_memory_working_set_bytes{container!="", container!="POD", instance!="", %s}[%s] %s), "node", "$1", "instance", "(.+)"
  601. ), "container_name", "$1", "container", "(.+)"
  602. ), "pod_name", "$1", "pod", "(.+)"
  603. )
  604. ) by (namespace, container_name, pod_name, node, %s)`
  605. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  606. cfg := pds.promConfig
  607. resolution = snapResolutionMinute(resolution)
  608. resMins := int64(resolution.Minutes())
  609. resStr := formatResolutionMinutes(resMins)
  610. queryRAMUsage := fmt.Sprintf(ramUsageQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  611. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  612. return ctx.QueryRange(queryRAMUsage, start, end, resolution)
  613. }
  614. func (pds *PrometheusDataSource) QueryRAMAllocationOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  615. // ramAllocationByteHoursQuery yields the total byte-hour RAM allocation over the given
  616. // window, aggregated by container.
  617. // [line 3] sum_over_time(each byte) = [byte*scrape] by metric
  618. // [line 4] (scalar(avg(prometheus_target_interval_length_seconds)) = [seconds/scrape] / 60 / 60 = [hours/scrape] by container
  619. // [lines 2,4] sum(") by unique container key and multiply [byte*scrape] * [hours/scrape] for byte*hours
  620. // [lines 1,5] relabeling
  621. const ramAllocationByteHoursQuery = `
  622. label_replace(label_replace(
  623. sum(
  624. sum_over_time(container_memory_allocation_bytes{container!="",container!="POD", node!="", %s}[%s])
  625. ) by (namespace,container,pod,node,%s) * %f / 60 / 60
  626. , "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
  627. // env.GetPromClusterFilter(), resStr, env.GetPromClusterLabel(), scrapeIntervalSeconds)
  628. cfg := pds.promConfig
  629. resolution = snapResolutionMinute(resolution)
  630. resMins := int64(resolution.Minutes())
  631. resStr := formatResolutionMinutes(resMins)
  632. scrapeIntervalSeconds := cfg.ScrapeInterval.Seconds()
  633. queryRAMAllocationByteHours := fmt.Sprintf(ramAllocationByteHoursQuery, cfg.ClusterFilter, resStr, cfg.ClusterLabel, scrapeIntervalSeconds)
  634. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  635. return ctx.QueryRange(queryRAMAllocationByteHours, start, end, resolution)
  636. }
  637. func (pds *PrometheusDataSource) QueryCPURequestsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  638. const cpuRequestsQuery = `avg(
  639. label_replace(
  640. label_replace(
  641. sum_over_time(kube_pod_container_resource_requests{resource="cpu", unit="core", container!="",container!="POD", node!="", %s}[%s] %s)
  642. , "container_name","$1","container","(.+)"
  643. ), "pod_name","$1","pod","(.+)"
  644. )
  645. ) by (namespace,container_name,pod_name,node,%s)`
  646. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  647. cfg := pds.promConfig
  648. resolution = snapResolutionMinute(resolution)
  649. resMins := int64(resolution.Minutes())
  650. resStr := formatResolutionMinutes(resMins)
  651. queryCPURequests := fmt.Sprintf(cpuRequestsQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  652. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  653. return ctx.QueryRange(queryCPURequests, start, end, resolution)
  654. }
  655. func (pds *PrometheusDataSource) QueryCPUUsageOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  656. const cpuUsageQuery = `avg(
  657. label_replace(
  658. label_replace(
  659. label_replace(
  660. rate(
  661. container_cpu_usage_seconds_total{container!="", container!="POD", instance!="", %s}[%s] %s
  662. ), "node", "$1", "instance", "(.+)"
  663. ), "container_name", "$1", "container", "(.+)"
  664. ), "pod_name", "$1", "pod", "(.+)"
  665. )
  666. ) by (namespace, container_name, pod_name, node, %s)`
  667. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  668. cfg := pds.promConfig
  669. resolution = snapResolutionMinute(resolution)
  670. resMins := int64(resolution.Minutes())
  671. resStr := formatResolutionMinutes(resMins)
  672. queryCPUUsage := fmt.Sprintf(cpuUsageQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  673. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  674. return ctx.QueryRange(queryCPUUsage, start, end, resolution)
  675. }
  676. func (pds *PrometheusDataSource) QueryCPUAllocationOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  677. // cpuAllocationQuery yields the total VCPU-hour CPU allocation over the given
  678. // window, aggregated by container.
  679. // [line 3] sum_over_time(each VCPU*mins in window) = [VCPU*scrape] by metric
  680. // [line 4] (scalar(avg(prometheus_target_interval_length_seconds)) = [seconds/scrape] / 60 / 60 = [hours/scrape] by container
  681. // [lines 2,4] sum(") by unique container key and multiply [VCPU*scrape] * [hours/scrape] for VCPU*hours
  682. // [lines 1,5] relabeling
  683. const cpuAllocationQuery = `
  684. label_replace(label_replace(
  685. sum(
  686. sum_over_time(container_cpu_allocation{container!="",container!="POD", node!="", %s}[%s])
  687. ) by (namespace,container,pod,node,%s) * %f / 60 / 60
  688. , "container_name","$1","container","(.+)"), "pod_name","$1","pod","(.+)")`
  689. // env.GetPromClusterFilter(), resStr, env.GetPromClusterLabel(), scrapeIntervalSeconds)
  690. cfg := pds.promConfig
  691. resolution = snapResolutionMinute(resolution)
  692. resMins := int64(resolution.Minutes())
  693. resStr := formatResolutionMinutes(resMins)
  694. scrapeIntervalSeconds := cfg.ScrapeInterval.Seconds()
  695. queryCPUAllocation := fmt.Sprintf(cpuAllocationQuery, cfg.ClusterFilter, resStr, cfg.ClusterLabel, scrapeIntervalSeconds)
  696. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  697. return ctx.QueryRange(queryCPUAllocation, start, end, resolution)
  698. }
  699. func (pds *PrometheusDataSource) QueryGPURequestsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  700. const gpuRequestsQuery = `avg(
  701. label_replace(
  702. label_replace(
  703. sum_over_time(kube_pod_container_resource_requests{resource="nvidia_com_gpu", container!="",container!="POD", node!="", %s}[%s] %s),
  704. "container_name","$1","container","(.+)"
  705. ), "pod_name","$1","pod","(.+)"
  706. )
  707. ) by (namespace,container_name,pod_name,node,%s)`
  708. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  709. cfg := pds.promConfig
  710. resolution = snapResolutionMinute(resolution)
  711. resMins := int64(resolution.Minutes())
  712. resStr := formatResolutionMinutes(resMins)
  713. queryGPURequests := fmt.Sprintf(gpuRequestsQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  714. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  715. return ctx.QueryRange(queryGPURequests, start, end, resolution)
  716. }
  717. func (pds *PrometheusDataSource) QueryPVRequestsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  718. const pvRequestsQuery = `avg(avg(kube_persistentvolumeclaim_info{volumename != "", %s}) by (persistentvolumeclaim, storageclass, namespace, volumename, %s, kubernetes_node)
  719. *
  720. on (persistentvolumeclaim, namespace, %s, kubernetes_node) group_right(storageclass, volumename)
  721. sum(kube_persistentvolumeclaim_resource_requests_storage_bytes{%s}) by (persistentvolumeclaim, namespace, %s, kubernetes_node, kubernetes_name)) by (persistentvolumeclaim, storageclass, namespace, %s, volumename, kubernetes_node)`
  722. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterLabel(), env.GetPromClusterFilter(), env.GetPromClusterLabel(), env.GetPromClusterLabel())
  723. cfg := pds.promConfig
  724. resolution = snapResolutionMinute(resolution)
  725. queryPVRequests := fmt.Sprintf(pvRequestsQuery, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterLabel, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterLabel)
  726. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  727. return ctx.QueryRange(queryPVRequests, start, end, resolution)
  728. }
  729. func (pds *PrometheusDataSource) QueryPVCAllocationOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  730. // pvcAllocationQuery yields the total byte-hour PVC allocation over the given window.
  731. // sum_over_time(each byte) = [byte*scrape] by metric *(scalar(avg(prometheus_target_interval_length_seconds)) = [seconds/scrape] / 60 / 60 = [hours/scrape] by pod
  732. const pvcAllocationQuery = `sum(sum_over_time(pod_pvc_allocation{%s}[%s])) by (%s, namespace, pod, persistentvolume, persistentvolumeclaim) * %f/60/60`
  733. // env.GetPromClusterFilter(), resStr, env.GetPromClusterLabel(), scrapeIntervalSeconds)
  734. cfg := pds.promConfig
  735. resolution = snapResolutionMinute(resolution)
  736. resMins := int64(resolution.Minutes())
  737. resStr := formatResolutionMinutes(resMins)
  738. scrapeIntervalSeconds := cfg.ScrapeInterval.Seconds()
  739. queryPVCAllocation := fmt.Sprintf(pvcAllocationQuery, cfg.ClusterFilter, resStr, cfg.ClusterLabel, scrapeIntervalSeconds)
  740. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  741. return ctx.QueryRange(queryPVCAllocation, start, end, resolution)
  742. }
  743. func (pds *PrometheusDataSource) QueryPVHourlyCostOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  744. const pvHourlyCostQuery = `avg_over_time(pv_hourly_cost{%s}[%s])`
  745. // env.GetPromClusterFilter(), resStr)
  746. cfg := pds.promConfig
  747. resolution = snapResolutionMinute(resolution)
  748. resMins := int64(resolution.Minutes())
  749. resStr := formatResolutionMinutes(resMins)
  750. queryPVHourlyCost := fmt.Sprintf(pvHourlyCostQuery, cfg.ClusterFilter, resStr)
  751. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  752. return ctx.QueryRange(queryPVHourlyCost, start, end, resolution)
  753. }
  754. func (pds *PrometheusDataSource) QueryNetworkInZoneOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  755. const netZoneRequestsQuery = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="true", %s}[%s] %s)) by (namespace,pod_name,%s) / 1024 / 1024 / 1024`
  756. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  757. cfg := pds.promConfig
  758. resolution = snapResolutionMinute(resolution)
  759. resMins := int64(resolution.Minutes())
  760. resStr := formatResolutionMinutes(resMins)
  761. queryNetZoneRequests := fmt.Sprintf(netZoneRequestsQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  762. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  763. return ctx.QueryRange(queryNetZoneRequests, start, end, resolution)
  764. }
  765. func (pds *PrometheusDataSource) QueryNetworkInRegionOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  766. const netRegionRequestsQuery = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="false", sameZone="false", sameRegion="false", %s}[%s] %s)) by (namespace,pod_name,%s) / 1024 / 1024 / 1024`
  767. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  768. cfg := pds.promConfig
  769. resolution = snapResolutionMinute(resolution)
  770. resMins := int64(resolution.Minutes())
  771. resStr := formatResolutionMinutes(resMins)
  772. queryNetRegionRequests := fmt.Sprintf(netRegionRequestsQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  773. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  774. return ctx.QueryRange(queryNetRegionRequests, start, end, resolution)
  775. }
  776. func (pds *PrometheusDataSource) QueryNetworkInternetOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  777. const netInternetRequestsQuery = `sum(increase(kubecost_pod_network_egress_bytes_total{internet="true", %s}[%s] %s)) by (namespace,pod_name,%s) / 1024 / 1024 / 1024`
  778. // env.GetPromClusterFilter(), resStr, "", env.GetPromClusterLabel())
  779. cfg := pds.promConfig
  780. resolution = snapResolutionMinute(resolution)
  781. resMins := int64(resolution.Minutes())
  782. resStr := formatResolutionMinutes(resMins)
  783. queryNetInternetRequests := fmt.Sprintf(netInternetRequestsQuery, cfg.ClusterFilter, resStr, "", cfg.ClusterLabel)
  784. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  785. return ctx.QueryRange(queryNetInternetRequests, start, end, resolution)
  786. }
  787. func (pds *PrometheusDataSource) QueryNamespaceLabelsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  788. const namespaceLabelsQuery = `avg_over_time(kube_namespace_labels{%s}[%s])`
  789. // env.GetPromClusterFilter(), resStr
  790. cfg := pds.promConfig
  791. resolution = snapResolutionMinute(resolution)
  792. resMins := int64(resolution.Minutes())
  793. resStr := formatResolutionMinutes(resMins)
  794. queryNamespaceLabels := fmt.Sprintf(namespaceLabelsQuery, cfg.ClusterFilter, resStr)
  795. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  796. return ctx.QueryRange(queryNamespaceLabels, start, end, resolution)
  797. }
  798. func (pds *PrometheusDataSource) QueryNamespaceAnnotationsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  799. const namespaceAnnotationsQuery = `avg_over_time(kube_namespace_annotations{%s}[%s])`
  800. // env.GetPromClusterFilter(), resStr
  801. cfg := pds.promConfig
  802. resolution = snapResolutionMinute(resolution)
  803. resMins := int64(resolution.Minutes())
  804. resStr := formatResolutionMinutes(resMins)
  805. queryNamespaceAnnotations := fmt.Sprintf(namespaceAnnotationsQuery, cfg.ClusterFilter, resStr)
  806. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  807. return ctx.QueryRange(queryNamespaceAnnotations, start, end, resolution)
  808. }
  809. func (pds *PrometheusDataSource) QueryPodLabelsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  810. const podLabelsQuery = `avg_over_time(kube_pod_labels{%s}[%s])`
  811. // env.GetPromClusterFilter(), resStr
  812. cfg := pds.promConfig
  813. resolution = snapResolutionMinute(resolution)
  814. resMins := int64(resolution.Minutes())
  815. resStr := formatResolutionMinutes(resMins)
  816. queryPodLabels := fmt.Sprintf(podLabelsQuery, cfg.ClusterFilter, resStr)
  817. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  818. return ctx.QueryRange(queryPodLabels, start, end, resolution)
  819. }
  820. func (pds *PrometheusDataSource) QueryPodAnnotationsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  821. const podAnnotationsQuery = `avg_over_time(kube_pod_annotations{%s}[%s])`
  822. // env.GetPromClusterFilter(), resStr
  823. cfg := pds.promConfig
  824. resolution = snapResolutionMinute(resolution)
  825. resMins := int64(resolution.Minutes())
  826. resStr := formatResolutionMinutes(resMins)
  827. queryPodAnnotations := fmt.Sprintf(podAnnotationsQuery, cfg.ClusterFilter, resStr)
  828. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  829. return ctx.QueryRange(queryPodAnnotations, start, end, resolution)
  830. }
  831. func (pds *PrometheusDataSource) QueryServiceLabelsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  832. const serviceLabelsQuery = `avg_over_time(service_selector_labels{%s}[%s])`
  833. // env.GetPromClusterFilter(), resStr
  834. cfg := pds.promConfig
  835. resolution = snapResolutionMinute(resolution)
  836. resMins := int64(resolution.Minutes())
  837. resStr := formatResolutionMinutes(resMins)
  838. queryServiceLabels := fmt.Sprintf(serviceLabelsQuery, cfg.ClusterFilter, resStr)
  839. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  840. return ctx.QueryRange(queryServiceLabels, start, end, resolution)
  841. }
  842. func (pds *PrometheusDataSource) QueryDeploymentLabelsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  843. const deploymentLabelsQuery = `avg_over_time(deployment_match_labels{%s}[%s])`
  844. // env.GetPromClusterFilter(), resStr
  845. cfg := pds.promConfig
  846. resolution = snapResolutionMinute(resolution)
  847. resMins := int64(resolution.Minutes())
  848. resStr := formatResolutionMinutes(resMins)
  849. queryDeploymentLabels := fmt.Sprintf(deploymentLabelsQuery, cfg.ClusterFilter, resStr)
  850. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  851. return ctx.QueryRange(queryDeploymentLabels, start, end, resolution)
  852. }
  853. func (pds *PrometheusDataSource) QueryStatefulsetLabelsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  854. const statefulsetLabelsQuery = `avg_over_time(statefulSet_match_labels{%s}[%s])`
  855. // env.GetPromClusterFilter(), resStr
  856. cfg := pds.promConfig
  857. resolution = snapResolutionMinute(resolution)
  858. resMins := int64(resolution.Minutes())
  859. resStr := formatResolutionMinutes(resMins)
  860. queryStatefulsetLabels := fmt.Sprintf(statefulsetLabelsQuery, cfg.ClusterFilter, resStr)
  861. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  862. return ctx.QueryRange(queryStatefulsetLabels, start, end, resolution)
  863. }
  864. func (pds *PrometheusDataSource) QueryPodJobsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  865. const podJobsQuery = `sum(kube_pod_owner{owner_kind="Job", %s}) by (namespace,pod,owner_name,%s)`
  866. // env.GetPromClusterFilter(), env.GetPromClusterLabel()
  867. cfg := pds.promConfig
  868. resolution = snapResolutionMinute(resolution)
  869. queryPodJobs := fmt.Sprintf(podJobsQuery, cfg.ClusterFilter, cfg.ClusterLabel)
  870. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  871. return ctx.QueryRange(queryPodJobs, start, end, resolution)
  872. }
  873. func (pds *PrometheusDataSource) QueryPodDaemonsetsOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  874. const podDaemonsetsQuery = `sum(kube_pod_owner{owner_kind="DaemonSet", %s}) by (namespace,pod,owner_name,%s)`
  875. // env.GetPromClusterFilter(), env.GetPromClusterLabel()
  876. cfg := pds.promConfig
  877. resolution = snapResolutionMinute(resolution)
  878. queryPodDaemonsets := fmt.Sprintf(podDaemonsetsQuery, cfg.ClusterFilter, cfg.ClusterLabel)
  879. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  880. return ctx.QueryRange(queryPodDaemonsets, start, end, resolution)
  881. }
  882. func (pds *PrometheusDataSource) QueryNormalizationOverTime(start, end time.Time, resolution time.Duration) source.QueryResultsChan {
  883. const normalizationQuery = `max(count_over_time(kube_pod_container_resource_requests{resource="memory", unit="byte", %s}[%s] %s))`
  884. // env.GetPromClusterFilter(), resStr, "")
  885. cfg := pds.promConfig
  886. resolution = snapResolutionMinute(resolution)
  887. resMins := int64(resolution.Minutes())
  888. resStr := formatResolutionMinutes(resMins)
  889. queryNormalization := fmt.Sprintf(normalizationQuery, cfg.ClusterFilter, resStr, "")
  890. ctx := pds.promContexts.NewNamedContext(ComputeCostDataRangeContextName)
  891. return ctx.QueryRange(queryNormalization, start, end, resolution)
  892. }
  893. func (pds *PrometheusDataSource) QueryPVCost(start, end time.Time) source.QueryResultsChan {
  894. const pvCostQuery = `avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (%s, persistentvolume,provider_id)`
  895. durStr := timeutil.DurationString(end.Sub(start))
  896. if durStr == "" {
  897. panic("failed to parse duration string passed to QueryPVCost")
  898. }
  899. queryPVCost := fmt.Sprintf(pvCostQuery, pds.promConfig.ClusterFilter, durStr, pds.promConfig.ClusterLabel)
  900. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  901. return ctx.QueryAtTime(queryPVCost, end)
  902. }
  903. func (pds *PrometheusDataSource) QueryPVSize(start, end time.Time) source.QueryResultsChan {
  904. const pvSizeQuery = `avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (%s, persistentvolume)`
  905. cfg := pds.promConfig
  906. durStr := timeutil.DurationString(end.Sub(start))
  907. if durStr == "" {
  908. panic("failed to parse duration string passed to QueryPVCost")
  909. }
  910. queryPVSize := fmt.Sprintf(pvSizeQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  911. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  912. return ctx.QueryAtTime(queryPVSize, end)
  913. }
  914. func (pds *PrometheusDataSource) QueryPVStorageClass(start, end time.Time) source.QueryResultsChan {
  915. // `avg(avg_over_time(kubecost_pv_info{%s}[%s])) by (%s, persistentvolume, storageclass)`
  916. // , env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  917. const pvStorageSizeQuery = `avg(avg_over_time(kubecost_pv_info{%s}[%s])) by (%s, persistentvolume, storageclass)`
  918. cfg := pds.promConfig
  919. durStr := timeutil.DurationString(end.Sub(start))
  920. if durStr == "" {
  921. panic("failed to parse duration string passed to QueryPVStorageClass")
  922. }
  923. queryPVStorageClass := fmt.Sprintf(pvStorageSizeQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  924. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  925. return ctx.QueryAtTime(queryPVStorageClass, end)
  926. }
  927. func (pds *PrometheusDataSource) QueryPVUsedAverage(start, end time.Time) source.QueryResultsChan {
  928. // `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  929. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  930. const pvUsedAverageQuery = `avg(avg_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  931. cfg := pds.promConfig
  932. durStr := timeutil.DurationString(end.Sub(start))
  933. if durStr == "" {
  934. panic("failed to parse duration string passed to QueryPVUsedAverage")
  935. }
  936. queryPVUsedAvg := fmt.Sprintf(pvUsedAverageQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  937. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  938. return ctx.QueryAtTime(queryPVUsedAvg, end)
  939. }
  940. func (pds *PrometheusDataSource) QueryPVUsedMax(start, end time.Time) source.QueryResultsChan {
  941. // `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  942. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  943. const pvUsedMaxQuery = `max(max_over_time(kubelet_volume_stats_used_bytes{%s}[%s])) by (%s, persistentvolumeclaim, namespace)`
  944. cfg := pds.promConfig
  945. durStr := timeutil.DurationString(end.Sub(start))
  946. if durStr == "" {
  947. panic("failed to parse duration string passed to QueryPVUsedMax")
  948. }
  949. queryPVUsedMax := fmt.Sprintf(pvUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  950. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  951. return ctx.QueryAtTime(queryPVUsedMax, end)
  952. }
  953. func (pds *PrometheusDataSource) QueryPVCInfo(start, end time.Time) source.QueryResultsChan {
  954. // `avg(avg_over_time(kube_persistentvolumeclaim_info{%s}[%s])) by (%s, volumename, persistentvolumeclaim, namespace)`
  955. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  956. const pvcInfoQuery = `avg(avg_over_time(kube_persistentvolumeclaim_info{%s}[%s])) by (%s, volumename, persistentvolumeclaim, namespace)`
  957. cfg := pds.promConfig
  958. durStr := timeutil.DurationString(end.Sub(start))
  959. if durStr == "" {
  960. panic("failed to parse duration string passed to QueryPVCInfo")
  961. }
  962. queryPVCInfo := fmt.Sprintf(pvcInfoQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  963. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  964. return ctx.QueryAtTime(queryPVCInfo, end)
  965. }
  966. func (pds *PrometheusDataSource) QueryPVActiveMinutes(start, end time.Time) source.QueryResultsChan {
  967. // `avg(kube_persistentvolume_capacity_bytes{%s}) by (%s, persistentvolume)[%s:%dm]`
  968. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  969. const pvActiveMinsQuery = `avg(kube_persistentvolume_capacity_bytes{%s}) by (%s, persistentvolume)[%s:%dm]`
  970. cfg := pds.promConfig
  971. minsPerResolution := cfg.DataResolutionMinutes
  972. durStr := timeutil.DurationString(end.Sub(start))
  973. if durStr == "" {
  974. panic("failed to parse duration string passed to QueryPVActiveMinutes")
  975. }
  976. queryPVActiveMins := fmt.Sprintf(pvActiveMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  977. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  978. return ctx.QueryAtTime(queryPVActiveMins, end)
  979. }
  980. func (pds *PrometheusDataSource) QueryLocalStorageCost(start, end time.Time) source.QueryResultsChan {
  981. // `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  982. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  983. const localStorageCostQuery = `sum_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  984. cfg := pds.promConfig
  985. resolution := cfg.DataResolution
  986. durStr := timeutil.DurationString(end.Sub(start))
  987. if durStr == "" {
  988. panic("failed to parse duration string passed to QueryLocalStorageCost")
  989. }
  990. //Ensuring if data resolution is less than 60s default it to 1m
  991. var minsPerResolution int
  992. if minsPerResolution = int(resolution.Minutes()); int(resolution.Minutes()) == 0 {
  993. minsPerResolution = 1
  994. log.DedupedWarningf(3, "QueryLocalStorageCost: Configured resolution (%d seconds) is below the 60 seconds threshold. Overriding with 1 minute.", int(resolution.Seconds()))
  995. }
  996. // hourlyToCumulative is a scaling factor that, when multiplied by an
  997. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  998. // [min/res]*[hr/min] = [$/res]
  999. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  1000. costPerGBHr := 0.04 / 730.0
  1001. queryLocalStorageCost := fmt.Sprintf(localStorageCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  1002. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1003. return ctx.QueryAtTime(queryLocalStorageCost, end)
  1004. }
  1005. func (pds *PrometheusDataSource) QueryLocalStorageUsedCost(start, end time.Time) source.QueryResultsChan {
  1006. // `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  1007. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  1008. const localStorageUsedCostQuery = `sum_over_time(sum(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm]) / 1024 / 1024 / 1024 * %f * %f`
  1009. cfg := pds.promConfig
  1010. minsPerResolution := cfg.DataResolutionMinutes
  1011. durStr := timeutil.DurationString(end.Sub(start))
  1012. if durStr == "" {
  1013. panic("failed to parse duration string passed to QueryLocalStorageUsedCost")
  1014. }
  1015. // hourlyToCumulative is a scaling factor that, when multiplied by an
  1016. // hourly value, converts it to a cumulative value; i.e. [$/hr] *
  1017. // [min/res]*[hr/min] = [$/res]
  1018. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  1019. costPerGBHr := 0.04 / 730.0
  1020. queryLocalStorageUsedCost := fmt.Sprintf(localStorageUsedCostQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, hourlyToCumulative, costPerGBHr)
  1021. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1022. return ctx.QueryAtTime(queryLocalStorageUsedCost, end)
  1023. }
  1024. func (pds *PrometheusDataSource) QueryLocalStorageUsedAvg(start, end time.Time) source.QueryResultsChan {
  1025. // `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  1026. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  1027. const localStorageUsedAvgQuery = `avg(sum(avg_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  1028. cfg := pds.promConfig
  1029. durStr := timeutil.DurationString(end.Sub(start))
  1030. if durStr == "" {
  1031. panic("failed to parse duration string passed to QueryLocalStorageUsedAvg")
  1032. }
  1033. queryLocalStorageUsedAvg := fmt.Sprintf(localStorageUsedAvgQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  1034. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1035. return ctx.QueryAtTime(queryLocalStorageUsedAvg, end)
  1036. }
  1037. func (pds *PrometheusDataSource) QueryLocalStorageUsedMax(start, end time.Time) source.QueryResultsChan {
  1038. // `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  1039. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  1040. const localStorageUsedMaxQuery = `max(sum(max_over_time(container_fs_usage_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}[%s])) by (instance, device, %s, job)) by (instance, device, %s)`
  1041. cfg := pds.promConfig
  1042. durStr := timeutil.DurationString(end.Sub(start))
  1043. if durStr == "" {
  1044. panic("failed to parse duration string passed to QueryLocalStorageUsedMax")
  1045. }
  1046. queryLocalStorageUsedMax := fmt.Sprintf(localStorageUsedMaxQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  1047. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1048. return ctx.QueryAtTime(queryLocalStorageUsedMax, end)
  1049. }
  1050. func (pds *PrometheusDataSource) QueryLocalStorageBytes(start, end time.Time) source.QueryResultsChan {
  1051. // `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  1052. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  1053. const localStorageBytesQuery = `avg_over_time(sum(container_fs_limit_bytes{device=~"/dev/(nvme|sda).*", id="/", %s}) by (instance, device, %s)[%s:%dm])`
  1054. cfg := pds.promConfig
  1055. minsPerResolution := cfg.DataResolutionMinutes
  1056. durStr := timeutil.DurationString(end.Sub(start))
  1057. if durStr == "" {
  1058. panic("failed to parse duration string passed to QueryLocalStorageBytes")
  1059. }
  1060. queryLocalStorageBytes := fmt.Sprintf(localStorageBytesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  1061. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1062. return ctx.QueryAtTime(queryLocalStorageBytes, end)
  1063. }
  1064. func (pds *PrometheusDataSource) QueryLocalStorageActiveMinutes(start, end time.Time) source.QueryResultsChan {
  1065. // `count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`
  1066. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  1067. const localStorageActiveMinutesQuery = `count(node_total_hourly_cost{%s}) by (%s, node)[%s:%dm]`
  1068. cfg := pds.promConfig
  1069. minsPerResolution := cfg.DataResolutionMinutes
  1070. durStr := timeutil.DurationString(end.Sub(start))
  1071. if durStr == "" {
  1072. panic("failed to parse duration string passed to QueryLocalStorageActiveMinutes")
  1073. }
  1074. queryLocalStorageActiveMins := fmt.Sprintf(localStorageActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  1075. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1076. return ctx.QueryAtTime(queryLocalStorageActiveMins, end)
  1077. }
  1078. func (pds *PrometheusDataSource) QueryLocalStorageBytesByProvider(provider string, start, end time.Time) source.QueryResultsChan {
  1079. var localStorageBytesQuery string
  1080. key := strings.ToLower(provider)
  1081. if f, ok := providerStorageQueries[key]; ok {
  1082. localStorageBytesQuery = f(pds.promConfig, start, end, false, false)
  1083. } else {
  1084. localStorageBytesQuery = ""
  1085. }
  1086. if localStorageBytesQuery == "" {
  1087. return newEmptyResult()
  1088. }
  1089. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1090. return ctx.QueryAtTime(localStorageBytesQuery, end)
  1091. }
  1092. func (pds *PrometheusDataSource) QueryLocalStorageUsedByProvider(provider string, start, end time.Time) source.QueryResultsChan {
  1093. var localStorageUsedQuery string
  1094. key := strings.ToLower(provider)
  1095. if f, ok := providerStorageQueries[key]; ok {
  1096. localStorageUsedQuery = f(pds.promConfig, start, end, false, true)
  1097. } else {
  1098. localStorageUsedQuery = ""
  1099. }
  1100. if localStorageUsedQuery == "" {
  1101. return newEmptyResult()
  1102. }
  1103. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1104. return ctx.QueryAtTime(localStorageUsedQuery, end)
  1105. }
  1106. func (pds *PrometheusDataSource) QueryNodeCPUHourlyCost(start, end time.Time) source.QueryResultsChan {
  1107. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1108. const nodeCPUHourlyCostQuery = `avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (%s, node, instance_type, provider_id)`
  1109. cfg := pds.promConfig
  1110. durStr := timeutil.DurationString(end.Sub(start))
  1111. if durStr == "" {
  1112. panic("failed to parse duration string passed to QueryNodeCPUHourlyCost")
  1113. }
  1114. queryNodeCPUHourlyCost := fmt.Sprintf(nodeCPUHourlyCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1115. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1116. return ctx.QueryAtTime(queryNodeCPUHourlyCost, end)
  1117. }
  1118. func (pds *PrometheusDataSource) QueryNodeCPUCoresCapacity(start, end time.Time) source.QueryResultsChan {
  1119. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1120. const nodeCPUCoresCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (%s, node)`
  1121. cfg := pds.promConfig
  1122. durStr := timeutil.DurationString(end.Sub(start))
  1123. if durStr == "" {
  1124. panic("failed to parse duration string passed to QueryNodeCPUCoresCapacity")
  1125. }
  1126. queryNodeCPUCoresCapacity := fmt.Sprintf(nodeCPUCoresCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1127. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1128. return ctx.QueryAtTime(queryNodeCPUCoresCapacity, end)
  1129. }
  1130. func (pds *PrometheusDataSource) QueryNodeCPUCoresAllocatable(start, end time.Time) source.QueryResultsChan {
  1131. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1132. const nodeCPUCoresAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_cpu_cores{%s}[%s])) by (%s, node)`
  1133. cfg := pds.promConfig
  1134. durStr := timeutil.DurationString(end.Sub(start))
  1135. if durStr == "" {
  1136. panic("failed to parse duration string passed to QueryNodeCPUCoresAllocatable")
  1137. }
  1138. queryNodeCPUCoresAllocatable := fmt.Sprintf(nodeCPUCoresAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1139. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1140. return ctx.QueryAtTime(queryNodeCPUCoresAllocatable, end)
  1141. }
  1142. func (pds *PrometheusDataSource) QueryNodeRAMHourlyCost(start, end time.Time) source.QueryResultsChan {
  1143. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1144. const nodeRAMHourlyCostQuery = `avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (%s, node, instance_type, provider_id) / 1024 / 1024 / 1024`
  1145. cfg := pds.promConfig
  1146. durStr := timeutil.DurationString(end.Sub(start))
  1147. if durStr == "" {
  1148. panic("failed to parse duration string passed to QueryNodeRAMHourlyCost")
  1149. }
  1150. queryNodeRAMHourlyCost := fmt.Sprintf(nodeRAMHourlyCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1151. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1152. return ctx.QueryAtTime(queryNodeRAMHourlyCost, end)
  1153. }
  1154. func (pds *PrometheusDataSource) QueryNodeRAMBytesCapacity(start, end time.Time) source.QueryResultsChan {
  1155. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1156. const nodeRAMBytesCapacityQuery = `avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (%s, node)`
  1157. cfg := pds.promConfig
  1158. durStr := timeutil.DurationString(end.Sub(start))
  1159. if durStr == "" {
  1160. panic("failed to parse duration string passed to QueryNodeRAMBytesCapacity")
  1161. }
  1162. queryNodeRAMBytesCapacity := fmt.Sprintf(nodeRAMBytesCapacityQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1163. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1164. return ctx.QueryAtTime(queryNodeRAMBytesCapacity, end)
  1165. }
  1166. func (pds *PrometheusDataSource) QueryNodeRAMBytesAllocatable(start, end time.Time) source.QueryResultsChan {
  1167. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1168. const nodeRAMBytesAllocatableQuery = `avg(avg_over_time(kube_node_status_allocatable_memory_bytes{%s}[%s])) by (%s, node)`
  1169. cfg := pds.promConfig
  1170. durStr := timeutil.DurationString(end.Sub(start))
  1171. if durStr == "" {
  1172. panic("failed to parse duration string passed to QueryNodeRAMBytesAllocatable")
  1173. }
  1174. queryNodeRAMBytesAllocatable := fmt.Sprintf(nodeRAMBytesAllocatableQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1175. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1176. return ctx.QueryAtTime(queryNodeRAMBytesAllocatable, end)
  1177. }
  1178. func (pds *PrometheusDataSource) QueryNodeGPUCount(start, end time.Time) source.QueryResultsChan {
  1179. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1180. const nodeGPUCountQuery = `avg(avg_over_time(node_gpu_count{%s}[%s])) by (%s, node, provider_id)`
  1181. cfg := pds.promConfig
  1182. durStr := timeutil.DurationString(end.Sub(start))
  1183. if durStr == "" {
  1184. panic("failed to parse duration string passed to QueryNodeGPUCount")
  1185. }
  1186. queryNodeGPUCount := fmt.Sprintf(nodeGPUCountQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1187. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1188. return ctx.QueryAtTime(queryNodeGPUCount, end)
  1189. }
  1190. func (pds *PrometheusDataSource) QueryNodeGPUHourlyCost(start, end time.Time) source.QueryResultsChan {
  1191. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel())
  1192. const nodeGPUHourlyCostQuery = `avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (%s, node, instance_type, provider_id)`
  1193. cfg := pds.promConfig
  1194. durStr := timeutil.DurationString(end.Sub(start))
  1195. if durStr == "" {
  1196. panic("failed to parse duration string passed to QueryNodeGPUHourlyCost")
  1197. }
  1198. queryNodeGPUHourlyCost := fmt.Sprintf(nodeGPUHourlyCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1199. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1200. return ctx.QueryAtTime(queryNodeGPUHourlyCost, end)
  1201. }
  1202. func (pds *PrometheusDataSource) QueryNodeLabels(start, end time.Time) source.QueryResultsChan {
  1203. // env.GetPromClusterFilter(), durStr, minsPerResolution)
  1204. const labelsQuery = `count_over_time(kube_node_labels{%s}[%s:%dm])`
  1205. cfg := pds.promConfig
  1206. minsPerResolution := cfg.DataResolutionMinutes
  1207. durStr := timeutil.DurationString(end.Sub(start))
  1208. if durStr == "" {
  1209. panic("failed to parse duration string passed to QueryNodeLabels")
  1210. }
  1211. queryLabels := fmt.Sprintf(labelsQuery, cfg.ClusterFilter, durStr, minsPerResolution)
  1212. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1213. return ctx.QueryAtTime(queryLabels, end)
  1214. }
  1215. func (pds *PrometheusDataSource) QueryNodeActiveMinutes(start, end time.Time) source.QueryResultsChan {
  1216. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  1217. const activeMinsQuery = `avg(node_total_hourly_cost{%s}) by (node, %s, provider_id)[%s:%dm]`
  1218. cfg := pds.promConfig
  1219. minsPerResolution := cfg.DataResolutionMinutes
  1220. durStr := timeutil.DurationString(end.Sub(start))
  1221. if durStr == "" {
  1222. panic("failed to parse duration string passed to QueryNodeActiveMinutes")
  1223. }
  1224. queryActiveMins := fmt.Sprintf(activeMinsQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  1225. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1226. return ctx.QueryAtTime(queryActiveMins, end)
  1227. }
  1228. func (pds *PrometheusDataSource) QueryNodeIsSpot(start, end time.Time) source.QueryResultsChan {
  1229. // env.GetPromClusterFilter(), durStr, minsPerResolution)
  1230. const isSpotQuery = `avg_over_time(kubecost_node_is_spot{%s}[%s:%dm])`
  1231. cfg := pds.promConfig
  1232. minsPerResolution := cfg.DataResolutionMinutes
  1233. durStr := timeutil.DurationString(end.Sub(start))
  1234. if durStr == "" {
  1235. panic("failed to parse duration string passed to QueryNodeIsSpot")
  1236. }
  1237. queryIsSpot := fmt.Sprintf(isSpotQuery, cfg.ClusterFilter, durStr, minsPerResolution)
  1238. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1239. return ctx.QueryAtTime(queryIsSpot, end)
  1240. }
  1241. func (pds *PrometheusDataSource) QueryNodeCPUModeTotal(start, end time.Time) source.QueryResultsChan {
  1242. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel())
  1243. const nodeCPUModeTotalQuery = `sum(rate(node_cpu_seconds_total{%s}[%s:%dm])) by (kubernetes_node, %s, mode)`
  1244. cfg := pds.promConfig
  1245. minsPerResolution := cfg.DataResolutionMinutes
  1246. durStr := timeutil.DurationString(end.Sub(start))
  1247. if durStr == "" {
  1248. panic("failed to parse duration string passed to QueryNodeCPUModeTotal")
  1249. }
  1250. queryCPUModeTotal := fmt.Sprintf(nodeCPUModeTotalQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel)
  1251. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1252. return ctx.QueryAtTime(queryCPUModeTotal, end)
  1253. }
  1254. func (pds *PrometheusDataSource) QueryNodeCPUModePercent(start, end time.Time) source.QueryResultsChan {
  1255. const fmtQueryCPUModePct = `
  1256. sum(rate(node_cpu_seconds_total{%s}[%s])) by (%s, mode) / ignoring(mode)
  1257. group_left sum(rate(node_cpu_seconds_total{%s}[%s])) by (%s)
  1258. `
  1259. // env.GetPromClusterFilter(), windowStr, env.GetPromClusterLabel(), env.GetPromClusterFilter(), windowStr, fmtOffset, env.GetPromClusterLabel()
  1260. cfg := pds.promConfig
  1261. durStr := timeutil.DurationString(end.Sub(start))
  1262. if durStr == "" {
  1263. panic("failed to parse duration string passed to QueryNodeCPUModePercent")
  1264. }
  1265. queryCPUModePct := fmt.Sprintf(fmtQueryCPUModePct, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1266. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1267. return ctx.QueryAtTime(queryCPUModePct, end)
  1268. }
  1269. func (pds *PrometheusDataSource) QueryNodeRAMSystemPercent(start, end time.Time) source.QueryResultsChan {
  1270. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  1271. const nodeRAMSystemPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  1272. cfg := pds.promConfig
  1273. minsPerResolution := cfg.DataResolutionMinutes
  1274. durStr := timeutil.DurationString(end.Sub(start))
  1275. if durStr == "" {
  1276. panic("failed to parse duration string passed to QueryNodeRAMSystemPercent")
  1277. }
  1278. queryRAMSystemPct := fmt.Sprintf(nodeRAMSystemPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  1279. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1280. return ctx.QueryAtTime(queryRAMSystemPct, end)
  1281. }
  1282. func (pds *PrometheusDataSource) QueryNodeRAMUserPercent(start, end time.Time) source.QueryResultsChan {
  1283. // env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterFilter(), durStr, minsPerResolution, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  1284. const nodeRAMUserPctQuery = `sum(sum_over_time(container_memory_working_set_bytes{container_name!="POD",container_name!="",namespace!="kube-system", %s}[%s:%dm])) by (instance, %s) / avg(label_replace(sum(sum_over_time(kube_node_status_capacity_memory_bytes{%s}[%s:%dm])) by (node, %s), "instance", "$1", "node", "(.*)")) by (instance, %s)`
  1285. cfg := pds.promConfig
  1286. minsPerResolution := cfg.DataResolutionMinutes
  1287. durStr := timeutil.DurationString(end.Sub(start))
  1288. if durStr == "" {
  1289. panic("failed to parse duration string passed to QueryNodeRAMUserPercent")
  1290. }
  1291. queryRAMUserPct := fmt.Sprintf(nodeRAMUserPctQuery, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, cfg.ClusterLabel)
  1292. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1293. return ctx.QueryAtTime(queryRAMUserPct, end)
  1294. }
  1295. func (pds *PrometheusDataSource) QueryLBCost(start, end time.Time) source.QueryResultsChan {
  1296. // env.GetPromClusterFilter(), durStr, env.GetPromClusterLabel()
  1297. const lbCostQuery = `avg(avg_over_time(kubecost_load_balancer_cost{%s}[%s])) by (namespace, service_name, %s, ingress_ip)`
  1298. cfg := pds.promConfig
  1299. durStr := timeutil.DurationString(end.Sub(start))
  1300. if durStr == "" {
  1301. panic("failed to parse duration string passed to QueryLBCost")
  1302. }
  1303. queryLBCost := fmt.Sprintf(lbCostQuery, cfg.ClusterFilter, durStr, cfg.ClusterLabel)
  1304. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1305. return ctx.QueryAtTime(queryLBCost, end)
  1306. }
  1307. func (pds *PrometheusDataSource) QueryLBActiveMinutes(start, end time.Time) source.QueryResultsChan {
  1308. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), durStr, minsPerResolution)
  1309. const lbActiveMinutesQuery = `avg(kubecost_load_balancer_cost{%s}) by (namespace, service_name, %s, ingress_ip)[%s:%dm]`
  1310. cfg := pds.promConfig
  1311. minsPerResolution := cfg.DataResolutionMinutes
  1312. durStr := timeutil.DurationString(end.Sub(start))
  1313. if durStr == "" {
  1314. panic("failed to parse duration string passed to QueryLBActiveMinutes")
  1315. }
  1316. queryLBActiveMins := fmt.Sprintf(lbActiveMinutesQuery, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution)
  1317. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1318. return ctx.QueryAtTime(queryLBActiveMins, end)
  1319. }
  1320. func (pds *PrometheusDataSource) QueryDataCount(start, end time.Time) source.QueryResultsChan {
  1321. const fmtQueryDataCount = `
  1322. count_over_time(sum(kube_node_status_capacity_cpu_cores{%s}) by (%s)[%s:%dm]) * %d
  1323. `
  1324. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, minsPerResolution)
  1325. cfg := pds.promConfig
  1326. minsPerResolution := cfg.DataResolutionMinutes
  1327. durStr := timeutil.DurationString(end.Sub(start))
  1328. if durStr == "" {
  1329. panic("failed to parse duration string passed to QueryDataCount")
  1330. }
  1331. queryDataCount := fmt.Sprintf(fmtQueryDataCount, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, minsPerResolution)
  1332. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1333. return ctx.QueryAtTime(queryDataCount, end)
  1334. }
  1335. func (pds *PrometheusDataSource) QueryTotalGPU(start, end time.Time) source.QueryResultsChan {
  1336. const fmtQueryTotalGPU = `
  1337. sum(
  1338. sum_over_time(node_gpu_hourly_cost{%s}[%s:%dm]) * %f
  1339. ) by (%s)
  1340. `
  1341. // env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, hourlyToCumulative, env.GetPromClusterLabel())
  1342. cfg := pds.promConfig
  1343. minsPerResolution := cfg.DataResolutionMinutes
  1344. durStr := timeutil.DurationString(end.Sub(start))
  1345. if durStr == "" {
  1346. panic("failed to parse duration string passed to QueryTotalGPU")
  1347. }
  1348. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  1349. // value, converts it to a cumulative value; i.e.
  1350. // [$/hr] * [min/res]*[hr/min] = [$/res]
  1351. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  1352. queryTotalGPU := fmt.Sprintf(fmtQueryTotalGPU, cfg.ClusterFilter, durStr, minsPerResolution, hourlyToCumulative, cfg.ClusterLabel)
  1353. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1354. return ctx.QueryAtTime(queryTotalGPU, end)
  1355. }
  1356. func (pds *PrometheusDataSource) QueryTotalCPU(start, end time.Time) source.QueryResultsChan {
  1357. const fmtQueryTotalCPU = `
  1358. sum(
  1359. sum_over_time(avg(kube_node_status_capacity_cpu_cores{%s}) by (node, %s)[%s:%dm]) *
  1360. avg(avg_over_time(node_cpu_hourly_cost{%s}[%s:%dm])) by (node, %s) * %f
  1361. ) by (%s)
  1362. `
  1363. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterFilter(), windowStr, minsPerResolution, fmtOffset, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel()
  1364. cfg := pds.promConfig
  1365. minsPerResolution := cfg.DataResolutionMinutes
  1366. durStr := timeutil.DurationString(end.Sub(start))
  1367. if durStr == "" {
  1368. panic("failed to parse duration string passed to QueryTotalCPU")
  1369. }
  1370. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  1371. // value, converts it to a cumulative value; i.e.
  1372. // [$/hr] * [min/res]*[hr/min] = [$/res]
  1373. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  1374. queryTotalCPU := fmt.Sprintf(fmtQueryTotalCPU, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, hourlyToCumulative, cfg.ClusterLabel)
  1375. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1376. return ctx.QueryAtTime(queryTotalCPU, end)
  1377. }
  1378. func (pds *PrometheusDataSource) QueryTotalRAM(start, end time.Time) source.QueryResultsChan {
  1379. const fmtQueryTotalRAM = `
  1380. sum(
  1381. sum_over_time(avg(kube_node_status_capacity_memory_bytes{%s}) by (node, %s)[%s:%dm]) / 1024 / 1024 / 1024 *
  1382. avg(avg_over_time(node_ram_hourly_cost{%s}[%s:%dm])) by (node, %s) * %f
  1383. ) by (%s)
  1384. `
  1385. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, env.GetPromClusterFilter(), windowStr, minsPerResolution, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
  1386. cfg := pds.promConfig
  1387. minsPerResolution := cfg.DataResolutionMinutes
  1388. durStr := timeutil.DurationString(end.Sub(start))
  1389. if durStr == "" {
  1390. panic("failed to parse duration string passed to QueryTotalRAM")
  1391. }
  1392. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  1393. // value, converts it to a cumulative value; i.e.
  1394. // [$/hr] * [min/res]*[hr/min] = [$/res]
  1395. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  1396. queryTotalRAM := fmt.Sprintf(fmtQueryTotalRAM, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, hourlyToCumulative, cfg.ClusterLabel)
  1397. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1398. return ctx.QueryAtTime(queryTotalRAM, end)
  1399. }
  1400. func (pds *PrometheusDataSource) QueryTotalStorage(start, end time.Time) source.QueryResultsChan {
  1401. const fmtQueryTotalStorage = `
  1402. sum(
  1403. sum_over_time(avg(kube_persistentvolume_capacity_bytes{%s}) by (persistentvolume, %s)[%s:%dm]) / 1024 / 1024 / 1024 *
  1404. avg(avg_over_time(pv_hourly_cost{%s}[%s:%dm])) by (persistentvolume, %s) * %f
  1405. ) by (%s)
  1406. `
  1407. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), windowStr, minsPerResolution, env.GetPromClusterFilter(), windowStr, minsPerResolution, env.GetPromClusterLabel(), hourlyToCumulative, env.GetPromClusterLabel())
  1408. cfg := pds.promConfig
  1409. minsPerResolution := cfg.DataResolutionMinutes
  1410. durStr := timeutil.DurationString(end.Sub(start))
  1411. if durStr == "" {
  1412. panic("failed to parse duration string passed to QueryTotalStorage")
  1413. }
  1414. // hourlyToCumulative is a scaling factor that, when multiplied by an hourly
  1415. // value, converts it to a cumulative value; i.e.
  1416. // [$/hr] * [min/res]*[hr/min] = [$/res]
  1417. hourlyToCumulative := float64(minsPerResolution) * (1.0 / 60.0)
  1418. queryTotalStorage := fmt.Sprintf(fmtQueryTotalStorage, cfg.ClusterFilter, cfg.ClusterLabel, durStr, minsPerResolution, cfg.ClusterFilter, durStr, minsPerResolution, cfg.ClusterLabel, hourlyToCumulative, cfg.ClusterLabel)
  1419. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1420. return ctx.QueryAtTime(queryTotalStorage, end)
  1421. }
  1422. func (pds *PrometheusDataSource) QueryClusterCores(start, end time.Time, step time.Duration) source.QueryResultsChan {
  1423. const queryClusterCores = `sum(
  1424. avg(avg_over_time(kube_node_status_capacity_cpu_cores{%s}[%s])) by (node, %s) * avg(avg_over_time(node_cpu_hourly_cost{%s}[%s])) by (node, %s) * 730 +
  1425. avg(avg_over_time(node_gpu_hourly_cost{%s}[%s])) by (node, %s) * 730
  1426. ) by (%s)`
  1427. // env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  1428. cfg := pds.promConfig
  1429. durStr := timeutil.DurationString(step)
  1430. if durStr == "" {
  1431. panic("failed to parse duration string passed to QueryClusterCores")
  1432. }
  1433. clusterCoresQuery := fmt.Sprintf(queryClusterCores, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  1434. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1435. return ctx.QueryRange(clusterCoresQuery, start, end, step)
  1436. }
  1437. func (pds *PrometheusDataSource) QueryClusterRAM(start, end time.Time, step time.Duration) source.QueryResultsChan {
  1438. const queryClusterRAM = `sum(
  1439. avg(avg_over_time(kube_node_status_capacity_memory_bytes{%s}[%s])) by (node, %s) / 1024 / 1024 / 1024 * avg(avg_over_time(node_ram_hourly_cost{%s}[%s])) by (node, %s) * 730
  1440. ) by (%s)`
  1441. // env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterLabel())
  1442. cfg := pds.promConfig
  1443. durStr := timeutil.DurationString(step)
  1444. if durStr == "" {
  1445. panic("failed to parse duration string passed to QueryClusterCores")
  1446. }
  1447. clusterRAMQuery := fmt.Sprintf(queryClusterRAM, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel)
  1448. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1449. return ctx.QueryRange(clusterRAMQuery, start, end, step)
  1450. }
  1451. func (pds *PrometheusDataSource) QueryClusterStorage(start, end time.Time, step time.Duration) source.QueryResultsChan {
  1452. return pds.QueryClusterStorageByProvider("", start, end, step)
  1453. }
  1454. func (pds *PrometheusDataSource) QueryClusterStorageByProvider(provider string, start, end time.Time, step time.Duration) source.QueryResultsChan {
  1455. const queryStorage = `sum(
  1456. avg(avg_over_time(pv_hourly_cost{%s}[%s])) by (persistentvolume, %s) * 730
  1457. * avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[%s])) by (persistentvolume, %s) / 1024 / 1024 / 1024
  1458. ) by (%s) %s`
  1459. // env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterFilter(), fmtWindow, env.GetPromClusterLabel(), env.GetPromClusterLabel(), localStorageQuery)
  1460. var localStorageQuery string
  1461. if provider != "" {
  1462. key := strings.ToLower(provider)
  1463. if f, ok := providerStorageQueries[key]; ok {
  1464. localStorageQuery = f(pds.promConfig, start, end, true, false)
  1465. } else {
  1466. localStorageQuery = ""
  1467. }
  1468. }
  1469. if localStorageQuery != "" {
  1470. localStorageQuery = fmt.Sprintf(" + %s", localStorageQuery)
  1471. }
  1472. cfg := pds.promConfig
  1473. durStr := timeutil.DurationString(step)
  1474. if durStr == "" {
  1475. panic("failed to parse duration string passed to QueryClusterCores")
  1476. }
  1477. clusterStorageQuery := fmt.Sprintf(queryStorage, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterFilter, durStr, cfg.ClusterLabel, cfg.ClusterLabel, localStorageQuery)
  1478. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1479. return ctx.QueryRange(clusterStorageQuery, start, end, step)
  1480. }
  1481. func (pds *PrometheusDataSource) QueryClusterTotal(start, end time.Time, step time.Duration) source.QueryResultsChan {
  1482. return pds.QueryClusterTotalByProvider("", start, end, step)
  1483. }
  1484. func (pds *PrometheusDataSource) QueryClusterTotalByProvider(provider string, start, end time.Time, step time.Duration) source.QueryResultsChan {
  1485. const queryTotal = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 +
  1486. sum(
  1487. avg(avg_over_time(pv_hourly_cost{%s}[1h])) by (persistentvolume, %s) * 730
  1488. * avg(avg_over_time(kube_persistentvolume_capacity_bytes{%s}[1h])) by (persistentvolume, %s) / 1024 / 1024 / 1024
  1489. ) by (%s) %s`
  1490. var localStorageQuery string
  1491. if provider != "" {
  1492. key := strings.ToLower(provider)
  1493. if f, ok := providerStorageQueries[key]; ok {
  1494. localStorageQuery = f(pds.promConfig, start, end, true, false)
  1495. } else {
  1496. localStorageQuery = ""
  1497. }
  1498. }
  1499. if localStorageQuery != "" {
  1500. localStorageQuery = fmt.Sprintf(" + %s", localStorageQuery)
  1501. }
  1502. cfg := pds.promConfig
  1503. durStr := timeutil.DurationString(step)
  1504. if durStr == "" {
  1505. panic("failed to parse duration string passed to QueryClusterTotalByProvider")
  1506. }
  1507. clusterTotalQuery := fmt.Sprintf(queryTotal, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterFilter, cfg.ClusterLabel, cfg.ClusterLabel, localStorageQuery)
  1508. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1509. return ctx.QueryRange(clusterTotalQuery, start, end, step)
  1510. }
  1511. func (pds *PrometheusDataSource) QueryClusterNodes(start, end time.Time, step time.Duration) source.QueryResultsChan {
  1512. return pds.QueryClusterNodesByProvider("", start, end, step)
  1513. }
  1514. func (pds *PrometheusDataSource) QueryClusterNodesByProvider(provider string, start, end time.Time, step time.Duration) source.QueryResultsChan {
  1515. const queryNodes = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 %s`
  1516. // env.GetPromClusterFilter(), env.GetPromClusterLabel(), localStorageQuery)
  1517. var localStorageQuery string
  1518. if provider != "" {
  1519. key := strings.ToLower(provider)
  1520. if f, ok := providerStorageQueries[key]; ok {
  1521. localStorageQuery = f(pds.promConfig, start, end, true, false)
  1522. } else {
  1523. localStorageQuery = ""
  1524. }
  1525. }
  1526. if localStorageQuery != "" {
  1527. localStorageQuery = fmt.Sprintf(" + %s", localStorageQuery)
  1528. }
  1529. cfg := pds.promConfig
  1530. durStr := timeutil.DurationString(step)
  1531. if durStr == "" {
  1532. panic("failed to parse duration string passed to QueryClusterNodesByProvider")
  1533. }
  1534. clusterNodesCostQuery := fmt.Sprintf(queryNodes, cfg.ClusterFilter, cfg.ClusterLabel, localStorageQuery)
  1535. ctx := pds.promContexts.NewNamedContext(ClusterContextName)
  1536. return ctx.QueryRange(clusterNodesCostQuery, start, end, step)
  1537. }
  1538. func newEmptyResult() source.QueryResultsChan {
  1539. ch := make(source.QueryResultsChan)
  1540. go func() {
  1541. results := source.NewQueryResults("")
  1542. ch <- results
  1543. }()
  1544. return ch
  1545. }
  // snapResolutionMinute truncates res to a whole number of minutes. Any
  // resolution that truncates to zero or fewer minutes (including zero and
  // negative durations) is raised to one minute.
  func snapResolutionMinute(res time.Duration) time.Duration {
  	wholeMinutes := int64(math.Trunc(res.Minutes()))
  	if wholeMinutes > 0 {
  		return time.Duration(wholeMinutes) * time.Minute
  	}

  	// Sub-minute and non-positive inputs share the one-minute floor.
  	return time.Minute
  }
  // formatResolutionMinutes renders a resolution given in minutes as a duration
  // string. Values evenly divisible by 60 (including 0) are written in hours
  // with an "h" suffix; everything else is written in minutes with an "m"
  // suffix.
  func formatResolutionMinutes(resMins int64) string {
  	if hours, leftover := resMins/60, resMins%60; leftover == 0 {
  		return fmt.Sprintf("%dh", hours)
  	}

  	return fmt.Sprintf("%dm", resMins)
  }