router.go 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157
  1. package costmodel
  2. import (
  3. "context"
  4. "encoding/json"
  5. "flag"
  6. "fmt"
  7. "net"
  8. "net/http"
  9. "os"
  10. "reflect"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "k8s.io/klog"
  15. "github.com/julienschmidt/httprouter"
  16. costAnalyzerCloud "github.com/kubecost/cost-model/cloud"
  17. "github.com/patrickmn/go-cache"
  18. prometheusClient "github.com/prometheus/client_golang/api"
  19. prometheusAPI "github.com/prometheus/client_golang/api/prometheus/v1"
  20. "github.com/prometheus/client_golang/prometheus"
  21. v1 "k8s.io/api/core/v1"
  22. "k8s.io/client-go/kubernetes"
  23. "k8s.io/client-go/rest"
  24. )
  // prometheusServerEndpointEnvVar is the name of an environment variable.
  // NOTE(review): presumably it carries the Prometheus server address; the code
  // that reads it is outside this view.
  const prometheusServerEndpointEnvVar = "PROMETHEUS_SERVER_ENDPOINT"

  // prometheusTroubleshootingEp is a documentation URL for troubleshooting a
  // custom Prometheus setup.
  const prometheusTroubleshootingEp = "http://docs.kubecost.com/custom-prom#troubleshoot"
  // gitCommit is set by the build system; it stays empty when nothing injects it.
  var gitCommit string
  33. var Router = httprouter.New()
  34. var A Accesses
  35. type Accesses struct {
  36. PrometheusClient prometheusClient.Client
  37. ThanosClient prometheusClient.Client
  38. KubeClientSet kubernetes.Interface
  39. Cloud costAnalyzerCloud.Provider
  40. CPUPriceRecorder *prometheus.GaugeVec
  41. RAMPriceRecorder *prometheus.GaugeVec
  42. PersistentVolumePriceRecorder *prometheus.GaugeVec
  43. GPUPriceRecorder *prometheus.GaugeVec
  44. NodeTotalPriceRecorder *prometheus.GaugeVec
  45. RAMAllocationRecorder *prometheus.GaugeVec
  46. CPUAllocationRecorder *prometheus.GaugeVec
  47. GPUAllocationRecorder *prometheus.GaugeVec
  48. PVAllocationRecorder *prometheus.GaugeVec
  49. ContainerUptimeRecorder *prometheus.GaugeVec
  50. NetworkZoneEgressRecorder prometheus.Gauge
  51. NetworkRegionEgressRecorder prometheus.Gauge
  52. NetworkInternetEgressRecorder prometheus.Gauge
  53. ServiceSelectorRecorder *prometheus.GaugeVec
  54. DeploymentSelectorRecorder *prometheus.GaugeVec
  55. Model *CostModel
  56. Cache *cache.Cache
  57. }
  // DataEnvelope is the JSON wrapper written by the handlers in this file: an
  // HTTP-style status code, a status word, the payload, and an optional message.
  type DataEnvelope struct {
  	// Code mirrors an HTTP status code (the wrap helpers use 200 or 500).
  	Code int `json:"code"`
  	// Status is "success" or "error" when produced by the wrap helpers.
  	Status string `json:"status"`
  	// Data is the response payload; it is always emitted, even when nil.
  	Data interface{} `json:"data"`
  	// Message is dropped from the JSON output when empty.
  	Message string `json:"message,omitempty"`
  }
  // normalizeTimeParam converts a duration expressed in whole days into the
  // equivalent number of hours, e.g. "2d" becomes "48h". Any value that does not
  // end in "d" is returned unchanged, including the empty string (which
  // previously caused an out-of-range slice panic).
  //
  // An error is returned when the text before the trailing "d" is not a base-10
  // integer.
  func normalizeTimeParam(param string) (string, error) {
  	if !strings.HasSuffix(param, "d") {
  		return param, nil
  	}

  	days, err := strconv.ParseInt(strings.TrimSuffix(param, "d"), 10, 64)
  	if err != nil {
  		return "", err
  	}

  	return fmt.Sprintf("%dh", days*24), nil
  }
  77. func wrapDataWithMessage(data interface{}, err error, message string) []byte {
  78. var resp []byte
  79. if err != nil {
  80. klog.V(1).Infof("Error returned to client: %s", err.Error())
  81. resp, _ = json.Marshal(&DataEnvelope{
  82. Code: http.StatusInternalServerError,
  83. Status: "error",
  84. Message: err.Error(),
  85. Data: data,
  86. })
  87. } else {
  88. resp, _ = json.Marshal(&DataEnvelope{
  89. Code: http.StatusOK,
  90. Status: "success",
  91. Data: data,
  92. Message: message,
  93. })
  94. }
  95. return resp
  96. }
  97. func wrapData(data interface{}, err error) []byte {
  98. var resp []byte
  99. if err != nil {
  100. klog.V(1).Infof("Error returned to client: %s", err.Error())
  101. resp, _ = json.Marshal(&DataEnvelope{
  102. Code: http.StatusInternalServerError,
  103. Status: "error",
  104. Message: err.Error(),
  105. Data: data,
  106. })
  107. } else {
  108. resp, _ = json.Marshal(&DataEnvelope{
  109. Code: http.StatusOK,
  110. Status: "success",
  111. Data: data,
  112. })
  113. }
  114. return resp
  115. }
  116. // RefreshPricingData needs to be called when a new node joins the fleet, since we cache the relevant subsets of pricing data to avoid storing the whole thing.
  117. func (a *Accesses) RefreshPricingData(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  118. w.Header().Set("Content-Type", "application/json")
  119. w.Header().Set("Access-Control-Allow-Origin", "*")
  120. err := a.Cloud.DownloadPricingData()
  121. w.Write(wrapData(nil, err))
  122. }
  123. func filterFields(fields string, data map[string]*CostData) map[string]CostData {
  124. fs := strings.Split(fields, ",")
  125. fmap := make(map[string]bool)
  126. for _, f := range fs {
  127. fieldNameLower := strings.ToLower(f) // convert to go struct name by uppercasing first letter
  128. klog.V(1).Infof("to delete: %s", fieldNameLower)
  129. fmap[fieldNameLower] = true
  130. }
  131. filteredData := make(map[string]CostData)
  132. for cname, costdata := range data {
  133. s := reflect.TypeOf(*costdata)
  134. val := reflect.ValueOf(*costdata)
  135. costdata2 := CostData{}
  136. cd2 := reflect.New(reflect.Indirect(reflect.ValueOf(costdata2)).Type()).Elem()
  137. n := s.NumField()
  138. for i := 0; i < n; i++ {
  139. field := s.Field(i)
  140. value := val.Field(i)
  141. value2 := cd2.Field(i)
  142. if _, ok := fmap[strings.ToLower(field.Name)]; !ok {
  143. value2.Set(reflect.Value(value))
  144. }
  145. }
  146. filteredData[cname] = cd2.Interface().(CostData)
  147. }
  148. return filteredData
  149. }
  150. func (a *Accesses) CostDataModel(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  151. w.Header().Set("Content-Type", "application/json")
  152. w.Header().Set("Access-Control-Allow-Origin", "*")
  153. window := r.URL.Query().Get("timeWindow")
  154. offset := r.URL.Query().Get("offset")
  155. fields := r.URL.Query().Get("filterFields")
  156. namespace := r.URL.Query().Get("namespace")
  157. aggregationField := r.URL.Query().Get("aggregation")
  158. subfields := strings.Split(r.URL.Query().Get("aggregationSubfield"), ",")
  159. if offset != "" {
  160. offset = "offset " + offset
  161. }
  162. data, err := a.Model.ComputeCostData(a.PrometheusClient, a.KubeClientSet, a.Cloud, window, offset, namespace)
  163. if aggregationField != "" {
  164. c, err := a.Cloud.GetConfig()
  165. if err != nil {
  166. w.Write(wrapData(nil, err))
  167. return
  168. }
  169. discount, err := strconv.ParseFloat(c.Discount[:len(c.Discount)-1], 64)
  170. if err != nil {
  171. w.Write(wrapData(nil, err))
  172. return
  173. }
  174. discount = discount * 0.01
  175. dur, err := time.ParseDuration(window)
  176. if err != nil {
  177. w.Write(wrapData(nil, err))
  178. return
  179. }
  180. // dataCount is the number of time series data expected for the given interval,
  181. // which we compute because Prometheus time series vectors omit zero values.
  182. // This assumes hourly data, incremented by one to capture the 0th data point.
  183. dataCount := int64(dur.Hours()) + 1
  184. klog.V(1).Infof("for duration %s dataCount = %d", dur.String(), dataCount)
  185. opts := &AggregationOptions{
  186. DataCount: dataCount,
  187. Discount: discount,
  188. IdleCoefficients: make(map[string]float64),
  189. }
  190. agg := AggregateCostData(data, aggregationField, subfields, a.Cloud, opts)
  191. w.Write(wrapData(agg, nil))
  192. } else {
  193. if fields != "" {
  194. filteredData := filterFields(fields, data)
  195. w.Write(wrapData(filteredData, err))
  196. } else {
  197. w.Write(wrapData(data, err))
  198. }
  199. }
  200. }
  201. func (a *Accesses) ClusterCosts(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  202. w.Header().Set("Content-Type", "application/json")
  203. w.Header().Set("Access-Control-Allow-Origin", "*")
  204. window := r.URL.Query().Get("window")
  205. offset := r.URL.Query().Get("offset")
  206. data, err := ClusterCosts(a.PrometheusClient, a.Cloud, window, offset)
  207. w.Write(wrapData(data, err))
  208. }
  209. func (a *Accesses) ClusterCostsOverTime(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  210. w.Header().Set("Content-Type", "application/json")
  211. w.Header().Set("Access-Control-Allow-Origin", "*")
  212. start := r.URL.Query().Get("start")
  213. end := r.URL.Query().Get("end")
  214. window := r.URL.Query().Get("window")
  215. offset := r.URL.Query().Get("offset")
  216. data, err := ClusterCostsOverTime(a.PrometheusClient, a.Cloud, start, end, window, offset)
  217. w.Write(wrapData(data, err))
  218. }
  219. // AggregateCostModel handles HTTP requests to the aggregated cost model API, which can be parametrized
  220. // by time period using window and offset, aggregation field and subfield (e.g. grouping by label.app
  221. // using aggregation=label, aggregationSubfield=app), and filtered by namespace and cluster.
  222. func (a *Accesses) AggregateCostModel(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  223. w.Header().Set("Content-Type", "application/json")
  224. w.Header().Set("Access-Control-Allow-Origin", "*")
  225. window := r.URL.Query().Get("window")
  226. offset := r.URL.Query().Get("offset")
  227. namespace := r.URL.Query().Get("namespace")
  228. cluster := r.URL.Query().Get("cluster")
  229. field := r.URL.Query().Get("aggregation")
  230. subfieldStr := r.URL.Query().Get("aggregationSubfield")
  231. rate := r.URL.Query().Get("rate")
  232. allocateIdle := r.URL.Query().Get("allocateIdle") == "true"
  233. sharedNamespaces := r.URL.Query().Get("sharedNamespaces")
  234. sharedLabelNames := r.URL.Query().Get("sharedLabelNames")
  235. sharedLabelValues := r.URL.Query().Get("sharedLabelValues")
  236. remote := r.URL.Query().Get("remote")
  237. subfields := []string{}
  238. if len(subfieldStr) > 0 {
  239. subfields = strings.Split(r.URL.Query().Get("aggregationSubfield"), ",")
  240. }
  241. // timeSeries == true maintains the time series dimension of the data,
  242. // which by default gets summed over the entire interval
  243. includeTimeSeries := r.URL.Query().Get("timeSeries") == "true"
  244. // efficiency == true aggregates and returns usage and efficiency data
  245. includeEfficiency := r.URL.Query().Get("efficiency") == "true"
  246. // disableCache, if set to "true", tells this function to recompute and
  247. // cache the requested data
  248. disableCache := r.URL.Query().Get("disableCache") == "true" || allocateIdle
  249. // clearCache, if set to "true", tells this function to flush the cache,
  250. // then recompute and cache the requested data
  251. clearCache := r.URL.Query().Get("clearCache") == "true"
  252. // time window must be defined, whether by window and offset or by manually
  253. // setting the start and end times as ISO time strings
  254. var start, end string
  255. var dur time.Duration
  256. layout := "2006-01-02T15:04:05.000Z"
  257. if window == "" {
  258. start = r.URL.Query().Get("start")
  259. startTime, err := time.Parse(layout, start)
  260. if err != nil {
  261. w.WriteHeader(http.StatusBadRequest)
  262. w.Write(wrapData(nil, fmt.Errorf("Invalid start parameter: %s", start)))
  263. return
  264. }
  265. end = r.URL.Query().Get("end")
  266. endTime, err := time.Parse(layout, end)
  267. if err != nil {
  268. w.WriteHeader(http.StatusBadRequest)
  269. w.Write(wrapData(nil, fmt.Errorf("Invalid end parameter: %s", end)))
  270. return
  271. }
  272. dur = endTime.Sub(startTime)
  273. } else {
  274. // endTime defaults to the current time, unless an offset is explicity declared,
  275. // in which case it shifts endTime back by given duration
  276. endTime := time.Now()
  277. if offset != "" {
  278. o, err := time.ParseDuration(offset)
  279. if err != nil {
  280. klog.V(1).Infof("error parsing offset: %s", err)
  281. w.Write(wrapData(nil, err))
  282. return
  283. }
  284. endTime = endTime.Add(-1 * o)
  285. }
  286. if a.ThanosClient != nil {
  287. if endTime.After(time.Now().Add(-3 * time.Hour)) {
  288. klog.Infof("Setting end time backwards to first present data")
  289. endTime = time.Now().Add(-3 * time.Hour)
  290. }
  291. }
  292. // if window is defined in terms of days, convert to hours
  293. // e.g. convert "2d" to "48h"
  294. window, err := normalizeTimeParam(window)
  295. if err != nil {
  296. w.Write(wrapData(nil, err))
  297. return
  298. }
  299. // convert time window into start and end times, formatted
  300. // as ISO datetime strings
  301. dur, err = time.ParseDuration(window)
  302. if err != nil {
  303. w.Write(wrapData(nil, err))
  304. return
  305. }
  306. startTime := endTime.Add(-1 * dur)
  307. start = startTime.Format(layout)
  308. end = endTime.Format(layout)
  309. klog.V(1).Infof("start: %s, end: %s", start, end)
  310. }
  311. // dataCount is the number of time series data expected for the given interval,
  312. // which we compute because Prometheus time series vectors omit zero values.
  313. // This assumes hourly data, incremented by one to capture the 0th data point.
  314. dataCount := int64(dur.Hours()) + 1
  315. klog.V(1).Infof("for duration %s dataCount = %d", dur.String(), dataCount)
  316. // aggregation field is required
  317. if field == "" {
  318. w.WriteHeader(http.StatusBadRequest)
  319. w.Write(wrapData(nil, fmt.Errorf("Missing aggregation field parameter")))
  320. return
  321. }
  322. // aggregation subfield is required when aggregation field is "label"
  323. if field == "label" && len(subfields) == 0 {
  324. w.WriteHeader(http.StatusBadRequest)
  325. w.Write(wrapData(nil, fmt.Errorf("Missing aggregation subfield parameter for aggregation by label")))
  326. return
  327. }
  328. // enforce one of four available rate options
  329. if rate != "" && rate != "hourly" && rate != "daily" && rate != "monthly" {
  330. w.WriteHeader(http.StatusBadRequest)
  331. w.Write(wrapData(nil, fmt.Errorf("If set, rate parameter must be one of: 'hourly', 'daily', 'monthly'")))
  332. return
  333. }
  334. // clear cache prior to checking the cache so that a clearCache=true
  335. // request always returns a freshly computed value
  336. if clearCache {
  337. a.Cache.Flush()
  338. }
  339. // parametrize cache key by all request parameters
  340. aggKey := fmt.Sprintf("aggregate:%s:%s:%s:%s:%s:%s:%s:%t:%t:%t",
  341. window, offset, namespace, cluster, field, strings.Join(subfields, ","), rate, allocateIdle, includeTimeSeries, includeEfficiency)
  342. // check the cache for aggregated response; if cache is hit and not disabled, return response
  343. if result, found := a.Cache.Get(aggKey); found && !disableCache {
  344. w.Write(wrapDataWithMessage(result, nil, fmt.Sprintf("cache hit: %s", aggKey)))
  345. return
  346. }
  347. remoteAvailable := os.Getenv(remoteEnabled) == "true"
  348. remoteEnabled := false
  349. if remoteAvailable && remote != "false" {
  350. remoteEnabled = true
  351. }
  352. // Use Thanos Client if it exists (enabled) and remote flag set
  353. var pClient prometheusClient.Client
  354. if remote != "false" && a.ThanosClient != nil {
  355. pClient = a.ThanosClient
  356. } else {
  357. pClient = a.PrometheusClient
  358. }
  359. data, err := a.Model.ComputeCostDataRange(pClient, a.KubeClientSet, a.Cloud, start, end, "1h", namespace, cluster, remoteEnabled)
  360. if err != nil {
  361. klog.V(1).Infof("error computing cost data range: start=%s, end=%s, err=%s", start, end, err)
  362. w.Write(wrapData(nil, err))
  363. return
  364. }
  365. c, err := a.Cloud.GetConfig()
  366. if err != nil {
  367. w.Write(wrapData(nil, err))
  368. return
  369. }
  370. discount, err := strconv.ParseFloat(c.Discount[:len(c.Discount)-1], 64)
  371. if err != nil {
  372. w.Write(wrapData(nil, err))
  373. return
  374. }
  375. discount = discount * 0.01
  376. idleCoefficients := make(map[string]float64)
  377. if allocateIdle {
  378. windowStr := fmt.Sprintf("%dh", int(dur.Hours()))
  379. if a.ThanosClient != nil {
  380. klog.Infof("Setting offset to 3h")
  381. offset = "3h"
  382. }
  383. idleCoefficients, err = ComputeIdleCoefficient(data, pClient, a.Cloud, discount, windowStr, offset)
  384. if err != nil {
  385. klog.V(1).Infof("error computing idle coefficient: windowString=%s, offset=%s, err=%s", windowStr, offset, err)
  386. w.Write(wrapData(nil, err))
  387. return
  388. }
  389. }
  390. sn := []string{}
  391. sln := []string{}
  392. slv := []string{}
  393. if sharedNamespaces != "" {
  394. sn = strings.Split(sharedNamespaces, ",")
  395. }
  396. if sharedLabelNames != "" {
  397. sln = strings.Split(sharedLabelNames, ",")
  398. slv = strings.Split(sharedLabelValues, ",")
  399. if len(sln) != len(slv) || slv[0] == "" {
  400. w.Write(wrapData(nil, fmt.Errorf("Supply exacly one label value per label name")))
  401. return
  402. }
  403. }
  404. var sr *SharedResourceInfo
  405. if len(sn) > 0 || len(sln) > 0 {
  406. sr = NewSharedResourceInfo(true, sn, sln, slv)
  407. }
  408. for cid, idleCoefficient := range idleCoefficients {
  409. klog.Infof("Idle Coeff: %s: %f", cid, idleCoefficient)
  410. }
  411. // aggregate cost model data by given fields and cache the result for the default expiration
  412. opts := &AggregationOptions{
  413. DataCount: dataCount,
  414. Discount: discount,
  415. IdleCoefficients: idleCoefficients,
  416. IncludeEfficiency: includeEfficiency,
  417. IncludeTimeSeries: includeTimeSeries,
  418. Rate: rate,
  419. SharedResourceInfo: sr,
  420. }
  421. result := AggregateCostData(data, field, subfields, a.Cloud, opts)
  422. a.Cache.Set(aggKey, result, cache.DefaultExpiration)
  423. w.Write(wrapDataWithMessage(result, nil, fmt.Sprintf("cache miss: %s", aggKey)))
  424. }
  425. func (a *Accesses) CostDataModelRange(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  426. w.Header().Set("Content-Type", "application/json")
  427. w.Header().Set("Access-Control-Allow-Origin", "*")
  428. start := r.URL.Query().Get("start")
  429. end := r.URL.Query().Get("end")
  430. window := r.URL.Query().Get("window")
  431. fields := r.URL.Query().Get("filterFields")
  432. namespace := r.URL.Query().Get("namespace")
  433. cluster := r.URL.Query().Get("cluster")
  434. aggregationField := r.URL.Query().Get("aggregation")
  435. subfields := strings.Split(r.URL.Query().Get("aggregationSubfield"), ",")
  436. remote := r.URL.Query().Get("remote")
  437. remoteAvailable := os.Getenv(remoteEnabled)
  438. remoteEnabled := false
  439. if remoteAvailable == "true" && remote != "false" {
  440. remoteEnabled = true
  441. }
  442. // Use Thanos Client if it exists (enabled) and remote flag set
  443. var pClient prometheusClient.Client
  444. if remote != "false" && a.ThanosClient != nil {
  445. pClient = a.ThanosClient
  446. } else {
  447. pClient = a.PrometheusClient
  448. }
  449. data, err := a.Model.ComputeCostDataRange(pClient, a.KubeClientSet, a.Cloud, start, end, window, namespace, cluster, remoteEnabled)
  450. if err != nil {
  451. w.Write(wrapData(nil, err))
  452. }
  453. if aggregationField != "" {
  454. c, err := a.Cloud.GetConfig()
  455. if err != nil {
  456. w.Write(wrapData(nil, err))
  457. return
  458. }
  459. discount, err := strconv.ParseFloat(c.Discount[:len(c.Discount)-1], 64)
  460. if err != nil {
  461. w.Write(wrapData(nil, err))
  462. }
  463. discount = discount * 0.01
  464. layout := "2006-01-02T15:04:05.000Z"
  465. startTime, err := time.Parse(layout, start)
  466. if err != nil {
  467. w.Write(wrapData(nil, err))
  468. return
  469. }
  470. endTime, err := time.Parse(layout, end)
  471. if err != nil {
  472. w.Write(wrapData(nil, err))
  473. return
  474. }
  475. dur := endTime.Sub(startTime)
  476. if err != nil {
  477. w.Write(wrapData(nil, err))
  478. return
  479. }
  480. windowHrs, err := strconv.ParseInt(window[:len(window)-1], 10, 64)
  481. // dataCount is the number of time series data expected for the given interval,
  482. // which we compute because Prometheus time series vectors omit zero values.
  483. // This assumes hourly data, incremented by one to capture the 0th data point.
  484. dataCount := (int64(dur.Hours()) / windowHrs) + 1
  485. klog.V(1).Infof("for duration %s dataCount = %d", dur.String(), dataCount)
  486. opts := &AggregationOptions{
  487. DataCount: dataCount,
  488. Discount: discount,
  489. IdleCoefficients: make(map[string]float64),
  490. }
  491. agg := AggregateCostData(data, aggregationField, subfields, a.Cloud, opts)
  492. w.Write(wrapData(agg, nil))
  493. } else {
  494. if fields != "" {
  495. filteredData := filterFields(fields, data)
  496. w.Write(wrapData(filteredData, err))
  497. } else {
  498. w.Write(wrapData(data, err))
  499. }
  500. }
  501. }
  502. // CostDataModelRangeLarge is experimental multi-cluster and long-term data storage in SQL support.
  503. func (a *Accesses) CostDataModelRangeLarge(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  504. w.Header().Set("Content-Type", "application/json")
  505. w.Header().Set("Access-Control-Allow-Origin", "*")
  506. startString := r.URL.Query().Get("start")
  507. endString := r.URL.Query().Get("end")
  508. windowString := r.URL.Query().Get("window")
  509. layout := "2006-01-02T15:04:05.000Z"
  510. var start time.Time
  511. var end time.Time
  512. var err error
  513. if windowString == "" {
  514. windowString = "1h"
  515. }
  516. if startString != "" {
  517. start, err = time.Parse(layout, startString)
  518. if err != nil {
  519. klog.V(1).Infof("Error parsing time " + startString + ". Error: " + err.Error())
  520. w.Write(wrapData(nil, err))
  521. }
  522. } else {
  523. window, err := time.ParseDuration(windowString)
  524. if err != nil {
  525. w.Write(wrapData(nil, fmt.Errorf("Invalid duration '%s'", windowString)))
  526. }
  527. start = time.Now().Add(-2 * window)
  528. }
  529. if endString != "" {
  530. end, err = time.Parse(layout, endString)
  531. if err != nil {
  532. klog.V(1).Infof("Error parsing time " + endString + ". Error: " + err.Error())
  533. w.Write(wrapData(nil, err))
  534. }
  535. } else {
  536. end = time.Now()
  537. }
  538. remoteLayout := "2006-01-02T15:04:05Z"
  539. remoteStartStr := start.Format(remoteLayout)
  540. remoteEndStr := end.Format(remoteLayout)
  541. klog.V(1).Infof("Using remote database for query from %s to %s with window %s", startString, endString, windowString)
  542. data, err := CostDataRangeFromSQL("", "", windowString, remoteStartStr, remoteEndStr)
  543. w.Write(wrapData(data, err))
  544. }
  545. func (a *Accesses) OutofClusterCosts(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  546. w.Header().Set("Content-Type", "application/json")
  547. w.Header().Set("Access-Control-Allow-Origin", "*")
  548. start := r.URL.Query().Get("start")
  549. end := r.URL.Query().Get("end")
  550. aggregator := r.URL.Query().Get("aggregator")
  551. customAggregation := r.URL.Query().Get("customAggregation")
  552. var data []*costAnalyzerCloud.OutOfClusterAllocation
  553. var err error
  554. if customAggregation != "" {
  555. data, err = a.Cloud.ExternalAllocations(start, end, customAggregation)
  556. } else {
  557. data, err = a.Cloud.ExternalAllocations(start, end, "kubernetes_"+aggregator)
  558. }
  559. w.Write(wrapData(data, err))
  560. }
  561. func (p *Accesses) GetAllNodePricing(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  562. w.Header().Set("Content-Type", "application/json")
  563. w.Header().Set("Access-Control-Allow-Origin", "*")
  564. data, err := p.Cloud.AllNodePricing()
  565. w.Write(wrapData(data, err))
  566. }
  567. func (p *Accesses) GetConfigs(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  568. w.Header().Set("Content-Type", "application/json")
  569. w.Header().Set("Access-Control-Allow-Origin", "*")
  570. data, err := p.Cloud.GetConfig()
  571. w.Write(wrapData(data, err))
  572. }
  573. func (p *Accesses) UpdateSpotInfoConfigs(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  574. w.Header().Set("Content-Type", "application/json")
  575. w.Header().Set("Access-Control-Allow-Origin", "*")
  576. data, err := p.Cloud.UpdateConfig(r.Body, costAnalyzerCloud.SpotInfoUpdateType)
  577. if err != nil {
  578. w.Write(wrapData(data, err))
  579. return
  580. }
  581. w.Write(wrapData(data, err))
  582. err = p.Cloud.DownloadPricingData()
  583. if err != nil {
  584. klog.V(1).Infof("Error redownloading data on config update: %s", err.Error())
  585. }
  586. return
  587. }
  588. func (p *Accesses) UpdateAthenaInfoConfigs(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  589. w.Header().Set("Content-Type", "application/json")
  590. w.Header().Set("Access-Control-Allow-Origin", "*")
  591. data, err := p.Cloud.UpdateConfig(r.Body, costAnalyzerCloud.AthenaInfoUpdateType)
  592. if err != nil {
  593. w.Write(wrapData(data, err))
  594. return
  595. }
  596. w.Write(wrapData(data, err))
  597. return
  598. }
  599. func (p *Accesses) UpdateBigQueryInfoConfigs(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  600. w.Header().Set("Content-Type", "application/json")
  601. w.Header().Set("Access-Control-Allow-Origin", "*")
  602. data, err := p.Cloud.UpdateConfig(r.Body, costAnalyzerCloud.BigqueryUpdateType)
  603. if err != nil {
  604. w.Write(wrapData(data, err))
  605. return
  606. }
  607. w.Write(wrapData(data, err))
  608. return
  609. }
  610. func (p *Accesses) UpdateConfigByKey(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  611. w.Header().Set("Content-Type", "application/json")
  612. w.Header().Set("Access-Control-Allow-Origin", "*")
  613. data, err := p.Cloud.UpdateConfig(r.Body, "")
  614. if err != nil {
  615. w.Write(wrapData(data, err))
  616. return
  617. }
  618. w.Write(wrapData(data, err))
  619. return
  620. }
  621. func (p *Accesses) ManagementPlatform(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  622. w.Header().Set("Content-Type", "application/json")
  623. w.Header().Set("Access-Control-Allow-Origin", "*")
  624. data, err := p.Cloud.GetManagementPlatform()
  625. if err != nil {
  626. w.Write(wrapData(data, err))
  627. return
  628. }
  629. w.Write(wrapData(data, err))
  630. return
  631. }
  632. func (p *Accesses) ClusterInfo(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  633. w.Header().Set("Content-Type", "application/json")
  634. w.Header().Set("Access-Control-Allow-Origin", "*")
  635. data, err := p.Cloud.ClusterInfo()
  636. w.Write(wrapData(data, err))
  637. }
  638. func Healthz(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  639. w.WriteHeader(200)
  640. w.Header().Set("Content-Length", "0")
  641. w.Header().Set("Content-Type", "text/plain")
  642. }
  643. func (p *Accesses) GetPrometheusMetadata(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  644. w.Header().Set("Content-Type", "application/json")
  645. w.Header().Set("Access-Control-Allow-Origin", "*")
  646. w.Write(wrapData(ValidatePrometheus(p.PrometheusClient, false)))
  647. }
  648. func (p *Accesses) ContainerUptimes(w http.ResponseWriter, _ *http.Request, _ httprouter.Params) {
  649. w.Header().Set("Content-Type", "application/json")
  650. w.Header().Set("Access-Control-Allow-Origin", "*")
  651. res, err := ComputeUptimes(p.PrometheusClient)
  652. w.Write(wrapData(res, err))
  653. }
  654. func (a *Accesses) recordPrices() {
  655. go func() {
  656. containerSeen := make(map[string]bool)
  657. nodeSeen := make(map[string]bool)
  658. pvSeen := make(map[string]bool)
  659. pvcSeen := make(map[string]bool)
  660. getKeyFromLabelStrings := func(labels ...string) string {
  661. return strings.Join(labels, ",")
  662. }
  663. getLabelStringsFromKey := func(key string) []string {
  664. return strings.Split(key, ",")
  665. }
  666. for {
  667. klog.V(4).Info("Recording prices...")
  668. podlist := a.Model.Cache.GetAllPods()
  669. podStatus := make(map[string]v1.PodPhase)
  670. for _, pod := range podlist {
  671. podStatus[pod.Name] = pod.Status.Phase
  672. }
  673. // Record network pricing at global scope
  674. networkCosts, err := a.Cloud.NetworkPricing()
  675. if err != nil {
  676. klog.V(4).Infof("Failed to retrieve network costs: %s", err.Error())
  677. } else {
  678. a.NetworkZoneEgressRecorder.Set(networkCosts.ZoneNetworkEgressCost)
  679. a.NetworkRegionEgressRecorder.Set(networkCosts.RegionNetworkEgressCost)
  680. a.NetworkInternetEgressRecorder.Set(networkCosts.InternetNetworkEgressCost)
  681. }
  682. data, err := a.Model.ComputeCostData(a.PrometheusClient, a.KubeClientSet, a.Cloud, "2m", "", "")
  683. if err != nil {
  684. klog.V(1).Info("Error in price recording: " + err.Error())
  685. // zero the for loop so the time.Sleep will still work
  686. data = map[string]*CostData{}
  687. }
  688. for _, costs := range data {
  689. nodeName := costs.NodeName
  690. node := costs.NodeData
  691. if node == nil {
  692. klog.V(4).Infof("Skipping Node \"%s\" due to missing Node Data costs", nodeName)
  693. continue
  694. }
  695. cpuCost, _ := strconv.ParseFloat(node.VCPUCost, 64)
  696. cpu, _ := strconv.ParseFloat(node.VCPU, 64)
  697. ramCost, _ := strconv.ParseFloat(node.RAMCost, 64)
  698. ram, _ := strconv.ParseFloat(node.RAMBytes, 64)
  699. gpu, _ := strconv.ParseFloat(node.GPU, 64)
  700. gpuCost, _ := strconv.ParseFloat(node.GPUCost, 64)
  701. totalCost := cpu*cpuCost + ramCost*(ram/1024/1024/1024) + gpu*gpuCost
  702. namespace := costs.Namespace
  703. podName := costs.PodName
  704. containerName := costs.Name
  705. if costs.PVCData != nil {
  706. for _, pvc := range costs.PVCData {
  707. if pvc.Volume != nil {
  708. a.PVAllocationRecorder.WithLabelValues(namespace, podName, pvc.Claim, pvc.VolumeName).Set(pvc.Values[0].Value)
  709. labelKey := getKeyFromLabelStrings(namespace, podName, pvc.Claim, pvc.VolumeName)
  710. pvcSeen[labelKey] = true
  711. }
  712. }
  713. }
  714. a.CPUPriceRecorder.WithLabelValues(nodeName, nodeName).Set(cpuCost)
  715. a.RAMPriceRecorder.WithLabelValues(nodeName, nodeName).Set(ramCost)
  716. a.GPUPriceRecorder.WithLabelValues(nodeName, nodeName).Set(gpuCost)
  717. a.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName).Set(totalCost)
  718. labelKey := getKeyFromLabelStrings(nodeName, nodeName)
  719. nodeSeen[labelKey] = true
  720. if len(costs.RAMAllocation) > 0 {
  721. a.RAMAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.RAMAllocation[0].Value)
  722. }
  723. if len(costs.CPUAllocation) > 0 {
  724. a.CPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.CPUAllocation[0].Value)
  725. }
  726. if len(costs.GPUReq) > 0 {
  727. // allocation here is set to the request because shared GPU usage not yet supported.
  728. a.GPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.GPUReq[0].Value)
  729. }
  730. labelKey = getKeyFromLabelStrings(namespace, podName, containerName, nodeName, nodeName)
  731. if podStatus[podName] == v1.PodRunning { // Only report data for current pods
  732. containerSeen[labelKey] = true
  733. } else {
  734. containerSeen[labelKey] = false
  735. }
  736. storageClasses := a.Model.Cache.GetAllStorageClasses()
  737. storageClassMap := make(map[string]map[string]string)
  738. for _, storageClass := range storageClasses {
  739. params := storageClass.Parameters
  740. storageClassMap[storageClass.ObjectMeta.Name] = params
  741. if storageClass.GetAnnotations()["storageclass.kubernetes.io/is-default-class"] == "true" || storageClass.GetAnnotations()["storageclass.beta.kubernetes.io/is-default-class"] == "true" {
  742. storageClassMap["default"] = params
  743. storageClassMap[""] = params
  744. }
  745. }
  746. pvs := a.Model.Cache.GetAllPersistentVolumes()
  747. for _, pv := range pvs {
  748. parameters, ok := storageClassMap[pv.Spec.StorageClassName]
  749. if !ok {
  750. klog.V(4).Infof("Unable to find parameters for storage class \"%s\". Does pv \"%s\" have a storageClassName?", pv.Spec.StorageClassName, pv.Name)
  751. }
  752. cacPv := &costAnalyzerCloud.PV{
  753. Class: pv.Spec.StorageClassName,
  754. Region: pv.Labels[v1.LabelZoneRegion],
  755. Parameters: parameters,
  756. }
  757. GetPVCost(cacPv, pv, a.Cloud)
  758. c, _ := strconv.ParseFloat(cacPv.Cost, 64)
  759. a.PersistentVolumePriceRecorder.WithLabelValues(pv.Name, pv.Name).Set(c)
  760. labelKey := getKeyFromLabelStrings(pv.Name, pv.Name)
  761. pvSeen[labelKey] = true
  762. }
  763. containerUptime, _ := ComputeUptimes(a.PrometheusClient)
  764. for key, uptime := range containerUptime {
  765. container, _ := NewContainerMetricFromKey(key)
  766. a.ContainerUptimeRecorder.WithLabelValues(container.Namespace, container.PodName, container.ContainerName).Set(uptime)
  767. }
  768. }
  769. for labelString, seen := range nodeSeen {
  770. if !seen {
  771. labels := getLabelStringsFromKey(labelString)
  772. a.NodeTotalPriceRecorder.DeleteLabelValues(labels...)
  773. a.CPUPriceRecorder.DeleteLabelValues(labels...)
  774. a.GPUPriceRecorder.DeleteLabelValues(labels...)
  775. a.RAMPriceRecorder.DeleteLabelValues(labels...)
  776. delete(nodeSeen, labelString)
  777. }
  778. nodeSeen[labelString] = false
  779. }
  780. for labelString, seen := range containerSeen {
  781. if !seen {
  782. labels := getLabelStringsFromKey(labelString)
  783. a.RAMAllocationRecorder.DeleteLabelValues(labels...)
  784. a.CPUAllocationRecorder.DeleteLabelValues(labels...)
  785. a.GPUAllocationRecorder.DeleteLabelValues(labels...)
  786. a.ContainerUptimeRecorder.DeleteLabelValues(labels...)
  787. delete(containerSeen, labelString)
  788. }
  789. containerSeen[labelString] = false
  790. }
  791. for labelString, seen := range pvSeen {
  792. if !seen {
  793. labels := getLabelStringsFromKey(labelString)
  794. a.PersistentVolumePriceRecorder.DeleteLabelValues(labels...)
  795. delete(pvSeen, labelString)
  796. }
  797. pvSeen[labelString] = false
  798. }
  799. for labelString, seen := range pvcSeen {
  800. if !seen {
  801. labels := getLabelStringsFromKey(labelString)
  802. a.PVAllocationRecorder.DeleteLabelValues(labels...)
  803. delete(pvcSeen, labelString)
  804. }
  805. pvcSeen[labelString] = false
  806. }
  807. time.Sleep(time.Minute)
  808. }
  809. }()
  810. }
  811. func init() {
  812. klog.InitFlags(nil)
  813. flag.Set("v", "3")
  814. flag.Parse()
  815. klog.V(1).Infof("Starting cost-model (git commit \"%s\")", gitCommit)
  816. address := os.Getenv(prometheusServerEndpointEnvVar)
  817. if address == "" {
  818. klog.Fatalf("No address for prometheus set in $%s. Aborting.", prometheusServerEndpointEnvVar)
  819. }
  820. var LongTimeoutRoundTripper http.RoundTripper = &http.Transport{ // may be necessary for long prometheus queries. TODO: make this configurable
  821. Proxy: http.ProxyFromEnvironment,
  822. DialContext: (&net.Dialer{
  823. Timeout: 120 * time.Second,
  824. KeepAlive: 120 * time.Second,
  825. }).DialContext,
  826. TLSHandshakeTimeout: 10 * time.Second,
  827. }
  828. pc := prometheusClient.Config{
  829. Address: address,
  830. RoundTripper: LongTimeoutRoundTripper,
  831. }
  832. promCli, _ := prometheusClient.NewClient(pc)
  833. api := prometheusAPI.NewAPI(promCli)
  834. _, err := api.Config(context.Background())
  835. if err != nil {
  836. klog.Fatalf("No valid prometheus config file at %s. Error: %s . Troubleshooting help available at: %s", address, err.Error(), prometheusTroubleshootingEp)
  837. }
  838. klog.V(1).Info("Success: retrieved a prometheus config file from: " + address)
  839. _, err = ValidatePrometheus(promCli, false)
  840. if err != nil {
  841. klog.Fatalf("Failed to query prometheus at %s. Error: %s . Troubleshooting help available at: %s", address, err.Error(), prometheusTroubleshootingEp)
  842. }
  843. klog.V(1).Info("Success: retrieved the 'up' query against prometheus at: " + address)
  844. // Kubernetes API setup
  845. kc, err := rest.InClusterConfig()
  846. if err != nil {
  847. panic(err.Error())
  848. }
  849. kubeClientset, err := kubernetes.NewForConfig(kc)
  850. if err != nil {
  851. panic(err.Error())
  852. }
  853. cloudProviderKey := os.Getenv("CLOUD_PROVIDER_API_KEY")
  854. cloudProvider, err := costAnalyzerCloud.NewProvider(kubeClientset, cloudProviderKey)
  855. if err != nil {
  856. panic(err.Error())
  857. }
  858. cpuGv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  859. Name: "node_cpu_hourly_cost",
  860. Help: "node_cpu_hourly_cost hourly cost for each cpu on this node",
  861. }, []string{"instance", "node"})
  862. ramGv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  863. Name: "node_ram_hourly_cost",
  864. Help: "node_ram_hourly_cost hourly cost for each gb of ram on this node",
  865. }, []string{"instance", "node"})
  866. gpuGv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  867. Name: "node_gpu_hourly_cost",
  868. Help: "node_gpu_hourly_cost hourly cost for each gpu on this node",
  869. }, []string{"instance", "node"})
  870. totalGv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  871. Name: "node_total_hourly_cost",
  872. Help: "node_total_hourly_cost Total node cost per hour",
  873. }, []string{"instance", "node"})
  874. pvGv := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  875. Name: "pv_hourly_cost",
  876. Help: "pv_hourly_cost Cost per GB per hour on a persistent disk",
  877. }, []string{"volumename", "persistentvolume"})
  878. RAMAllocation := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  879. Name: "container_memory_allocation_bytes",
  880. Help: "container_memory_allocation_bytes Bytes of RAM used",
  881. }, []string{"namespace", "pod", "container", "instance", "node"})
  882. CPUAllocation := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  883. Name: "container_cpu_allocation",
  884. Help: "container_cpu_allocation Percent of a single CPU used in a minute",
  885. }, []string{"namespace", "pod", "container", "instance", "node"})
  886. GPUAllocation := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  887. Name: "container_gpu_allocation",
  888. Help: "container_gpu_allocation GPU used",
  889. }, []string{"namespace", "pod", "container", "instance", "node"})
  890. PVAllocation := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  891. Name: "pod_pvc_allocation",
  892. Help: "pod_pvc_allocation Bytes used by a PVC attached to a pod",
  893. }, []string{"namespace", "pod", "persistentvolumeclaim", "persistentvolume"})
  894. ContainerUptimeRecorder := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  895. Name: "container_uptime_seconds",
  896. Help: "container_uptime_seconds Seconds a container has been running",
  897. }, []string{"namespace", "pod", "container"})
  898. NetworkZoneEgressRecorder := prometheus.NewGauge(prometheus.GaugeOpts{
  899. Name: "kubecost_network_zone_egress_cost",
  900. Help: "kubecost_network_zone_egress_cost Total cost per GB egress across zones",
  901. })
  902. NetworkRegionEgressRecorder := prometheus.NewGauge(prometheus.GaugeOpts{
  903. Name: "kubecost_network_region_egress_cost",
  904. Help: "kubecost_network_region_egress_cost Total cost per GB egress across regions",
  905. })
  906. NetworkInternetEgressRecorder := prometheus.NewGauge(prometheus.GaugeOpts{
  907. Name: "kubecost_network_internet_egress_cost",
  908. Help: "kubecost_network_internet_egress_cost Total cost per GB of internet egress.",
  909. })
  910. prometheus.MustRegister(cpuGv)
  911. prometheus.MustRegister(ramGv)
  912. prometheus.MustRegister(gpuGv)
  913. prometheus.MustRegister(totalGv)
  914. prometheus.MustRegister(pvGv)
  915. prometheus.MustRegister(RAMAllocation)
  916. prometheus.MustRegister(CPUAllocation)
  917. prometheus.MustRegister(ContainerUptimeRecorder)
  918. prometheus.MustRegister(PVAllocation)
  919. prometheus.MustRegister(NetworkZoneEgressRecorder, NetworkRegionEgressRecorder, NetworkInternetEgressRecorder)
  920. prometheus.MustRegister(ServiceCollector{
  921. KubeClientSet: kubeClientset,
  922. })
  923. prometheus.MustRegister(DeploymentCollector{
  924. KubeClientSet: kubeClientset,
  925. })
  926. // cache responses from model for a default of 5 minutes; clear expired responses every 10 minutes
  927. modelCache := cache.New(time.Minute*5, time.Minute*10)
  928. A = Accesses{
  929. PrometheusClient: promCli,
  930. KubeClientSet: kubeClientset,
  931. Cloud: cloudProvider,
  932. CPUPriceRecorder: cpuGv,
  933. RAMPriceRecorder: ramGv,
  934. GPUPriceRecorder: gpuGv,
  935. NodeTotalPriceRecorder: totalGv,
  936. RAMAllocationRecorder: RAMAllocation,
  937. CPUAllocationRecorder: CPUAllocation,
  938. GPUAllocationRecorder: GPUAllocation,
  939. PVAllocationRecorder: PVAllocation,
  940. ContainerUptimeRecorder: ContainerUptimeRecorder,
  941. NetworkZoneEgressRecorder: NetworkZoneEgressRecorder,
  942. NetworkRegionEgressRecorder: NetworkRegionEgressRecorder,
  943. NetworkInternetEgressRecorder: NetworkInternetEgressRecorder,
  944. PersistentVolumePriceRecorder: pvGv,
  945. Model: NewCostModel(kubeClientset),
  946. Cache: modelCache,
  947. }
  948. remoteEnabled := os.Getenv(remoteEnabled)
  949. if remoteEnabled == "true" {
  950. info, err := cloudProvider.ClusterInfo()
  951. klog.Infof("Saving cluster with id:'%s', and name:'%s' to durable storage", info["id"], info["name"])
  952. if err != nil {
  953. klog.Infof("Error saving cluster id %s", err.Error())
  954. }
  955. _, _, err = costAnalyzerCloud.GetOrCreateClusterMeta(info["id"], info["name"])
  956. if err != nil {
  957. klog.Infof("Unable to set cluster id '%s' for cluster '%s', %s", info["id"], info["name"], err.Error())
  958. }
  959. }
  960. // Thanos Client
  961. if os.Getenv(thanosEnabled) == "true" {
  962. thanosUrl := os.Getenv(thanosQueryUrl)
  963. if thanosUrl != "" {
  964. var thanosRT http.RoundTripper = &http.Transport{
  965. Proxy: http.ProxyFromEnvironment,
  966. DialContext: (&net.Dialer{
  967. Timeout: 120 * time.Second,
  968. KeepAlive: 120 * time.Second,
  969. }).DialContext,
  970. TLSHandshakeTimeout: 10 * time.Second,
  971. }
  972. thanosConfig := prometheusClient.Config{
  973. Address: thanosUrl,
  974. RoundTripper: thanosRT,
  975. }
  976. thanosCli, _ := prometheusClient.NewClient(thanosConfig)
  977. _, err = ValidatePrometheus(thanosCli, true)
  978. if err != nil {
  979. klog.Fatalf("Failed to query Thanos at %s. Error: %s.", thanosUrl, err.Error())
  980. } else {
  981. klog.V(1).Info("Success: retrieved the 'up' query against Thanos at: " + thanosUrl)
  982. A.ThanosClient = thanosCli
  983. }
  984. } else {
  985. klog.Infof("Error resolving environment variable: $%s", thanosQueryUrl)
  986. }
  987. }
  988. err = A.Cloud.DownloadPricingData()
  989. if err != nil {
  990. klog.V(1).Info("Failed to download pricing data: " + err.Error())
  991. }
  992. A.recordPrices()
  993. Router.GET("/costDataModel", A.CostDataModel)
  994. Router.GET("/costDataModelRange", A.CostDataModelRange)
  995. Router.GET("/costDataModelRangeLarge", A.CostDataModelRangeLarge)
  996. Router.GET("/outOfClusterCosts", A.OutofClusterCosts)
  997. Router.GET("/allNodePricing", A.GetAllNodePricing)
  998. Router.GET("/healthz", Healthz)
  999. Router.GET("/getConfigs", A.GetConfigs)
  1000. Router.POST("/refreshPricing", A.RefreshPricingData)
  1001. Router.POST("/updateSpotInfoConfigs", A.UpdateSpotInfoConfigs)
  1002. Router.POST("/updateAthenaInfoConfigs", A.UpdateAthenaInfoConfigs)
  1003. Router.POST("/updateBigQueryInfoConfigs", A.UpdateBigQueryInfoConfigs)
  1004. Router.POST("/updateConfigByKey", A.UpdateConfigByKey)
  1005. Router.GET("/clusterCostsOverTime", A.ClusterCostsOverTime)
  1006. Router.GET("/clusterCosts", A.ClusterCosts)
  1007. Router.GET("/validatePrometheus", A.GetPrometheusMetadata)
  1008. Router.GET("/managementPlatform", A.ManagementPlatform)
  1009. Router.GET("/clusterInfo", A.ClusterInfo)
  1010. Router.GET("/containerUptimes", A.ContainerUptimes)
  1011. Router.GET("/aggregatedCostModel", A.AggregateCostModel)
  1012. }