aggregation.go 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831
  1. package costmodel
  2. import (
  3. "fmt"
  4. "math"
  5. "net/http"
  6. "regexp"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "github.com/julienschmidt/httprouter"
  12. "github.com/kubecost/cost-model/pkg/cloud"
  13. "github.com/kubecost/cost-model/pkg/env"
  14. "github.com/kubecost/cost-model/pkg/kubecost"
  15. "github.com/kubecost/cost-model/pkg/log"
  16. "github.com/kubecost/cost-model/pkg/prom"
  17. "github.com/kubecost/cost-model/pkg/thanos"
  18. "github.com/kubecost/cost-model/pkg/util"
  19. "github.com/patrickmn/go-cache"
  20. prometheusClient "github.com/prometheus/client_golang/api"
  21. "k8s.io/klog"
  22. )
  const (
  	// SplitTypeWeighted signals that shared costs should be shared
  	// proportionally, rather than evenly. AggregationOptions.SharedSplit is
  	// compared against this value.
  	SplitTypeWeighted = "weighted"
  	// UnallocatedSubfield indicates an allocation datum that does not have the
  	// chosen Aggregator; e.g. during aggregation by some label, there may be
  	// cost data that do not have the given label. It is used as the
  	// aggregation key for such data.
  	UnallocatedSubfield = "__unallocated__"
  )
  // Aggregation describes aggregated cost data, containing cumulative cost and
  // allocation data per resource, vectors of rate data per resource, efficiency
  // data, and metadata describing the type of aggregation operation.
  //
  // Fields tagged `json:"-"` are working data and are never serialized. The
  // *CostVector fields are serialized only when non-empty; AggregateCostData
  // clears them unless time series data was requested.
  type Aggregation struct {
  	// Aggregator is the field the data was grouped by (e.g. "namespace").
  	Aggregator string `json:"aggregation"`
  	// Subfields holds the caller-supplied subfields (e.g. label names); it is
  	// only set when at least one subfield was given.
  	Subfields []string `json:"subfields,omitempty"`
  	// Environment is the aggregation key, i.e. the value of the Aggregator
  	// shared by all data in this Aggregation.
  	Environment string `json:"environment"`
  	// Cluster is not populated by the aggregation code visible here.
  	// NOTE(review): presumably set by callers — confirm.
  	Cluster string `json:"cluster,omitempty"`
  	// Properties is populated only when aggregateDatum is asked to include
  	// properties (the "container" aggregation).
  	Properties *kubecost.Properties `json:"-"`

  	// CPU: the hourly average is the allocation total divided by TotalHours;
  	// CPUCost is the sum of CPUCostVector (scaled when a rate is requested).
  	CPUAllocationHourlyAverage float64 `json:"cpuAllocationAverage"`
  	CPUAllocationVectors []*util.Vector `json:"-"`
  	CPUAllocationTotal float64 `json:"-"`
  	CPUCost float64 `json:"cpuCost"`
  	CPUCostVector []*util.Vector `json:"cpuCostVector,omitempty"`
  	// CPUEfficiency is 1 - (requested - used) / allocated, computed from
  	// clamped averages; it is only computed when efficiency is requested.
  	CPUEfficiency float64 `json:"cpuEfficiency"`
  	CPURequestedVectors []*util.Vector `json:"-"`
  	CPUUsedVectors []*util.Vector `json:"-"`

  	// Efficiency is the average of CPUEfficiency and RAMEfficiency weighted
  	// by CPUCost and RAMCost.
  	Efficiency float64 `json:"efficiency"`

  	// GPU: unlike the other resources, the hourly average is the average of
  	// GPUAllocationVectors rather than a total divided by hours.
  	GPUAllocationHourlyAverage float64 `json:"gpuAllocationAverage"`
  	GPUAllocationVectors []*util.Vector `json:"-"`
  	GPUCost float64 `json:"gpuCost"`
  	GPUCostVector []*util.Vector `json:"gpuCostVector,omitempty"`
  	GPUAllocationTotal float64 `json:"-"`

  	// RAM: the hourly average is divided by 1024^3 (bytes to GiB) before
  	// being returned; RAMAllocationTotal is left unconverted.
  	RAMAllocationHourlyAverage float64 `json:"ramAllocationAverage"`
  	RAMAllocationVectors []*util.Vector `json:"-"`
  	RAMAllocationTotal float64 `json:"-"`
  	RAMCost float64 `json:"ramCost"`
  	RAMCostVector []*util.Vector `json:"ramCostVector,omitempty"`
  	// RAMEfficiency is computed the same way as CPUEfficiency.
  	RAMEfficiency float64 `json:"ramEfficiency"`
  	RAMRequestedVectors []*util.Vector `json:"-"`
  	RAMUsedVectors []*util.Vector `json:"-"`

  	// PV (persistent volume): the hourly average is divided by 1024^3 (bytes
  	// to GiB) before being returned; PVAllocationTotal is left unconverted.
  	PVAllocationHourlyAverage float64 `json:"pvAllocationAverage"`
  	PVAllocationVectors []*util.Vector `json:"-"`
  	PVAllocationTotal float64 `json:"-"`
  	PVCost float64 `json:"pvCost"`
  	PVCostVector []*util.Vector `json:"pvCostVector,omitempty"`

  	NetworkCost float64 `json:"networkCost"`
  	NetworkCostVector []*util.Vector `json:"networkCostVector,omitempty"`

  	// SharedCost is this Aggregation's portion of shared resource costs and
  	// of the additional shared costs passed via AggregationOptions.
  	SharedCost float64 `json:"sharedCost"`
  	// TotalCost is the sum of the CPU, RAM, GPU, PV, network and shared costs.
  	TotalCost float64 `json:"totalCost"`
  	// TotalCostVector is the element-wise sum of the CPU, RAM, GPU, PV and
  	// network cost vectors; shared cost is not part of it.
  	TotalCostVector []*util.Vector `json:"totalCostVector,omitempty"`
  }
  74. // TotalHours determines the amount of hours the Aggregation covers, as a
  75. // function of the cost vectors and the resolution of those vectors' data
  76. func (a *Aggregation) TotalHours(resolutionHours float64) float64 {
  77. length := 1
  78. if length < len(a.CPUCostVector) {
  79. length = len(a.CPUCostVector)
  80. }
  81. if length < len(a.RAMCostVector) {
  82. length = len(a.RAMCostVector)
  83. }
  84. if length < len(a.PVCostVector) {
  85. length = len(a.PVCostVector)
  86. }
  87. if length < len(a.GPUCostVector) {
  88. length = len(a.GPUCostVector)
  89. }
  90. if length < len(a.NetworkCostVector) {
  91. length = len(a.NetworkCostVector)
  92. }
  93. return float64(length) * resolutionHours
  94. }
  95. // RateCoefficient computes the coefficient by which the total cost needs to be
  96. // multiplied in order to convert totals costs into per-rate costs.
  97. func (a *Aggregation) RateCoefficient(rateStr string, resolutionHours float64) float64 {
  98. // monthly rate = (730.0)*(total cost)/(total hours)
  99. // daily rate = (24.0)*(total cost)/(total hours)
  100. // hourly rate = (1.0)*(total cost)/(total hours)
  101. // default to hourly rate
  102. coeff := 1.0
  103. switch rateStr {
  104. case "daily":
  105. coeff = util.HoursPerDay
  106. case "monthly":
  107. coeff = util.HoursPerMonth
  108. }
  109. return coeff / a.TotalHours(resolutionHours)
  110. }
  // SharedResourceInfo describes which cost data should be treated as shared,
  // i.e. have their cost spread across the other aggregations instead of being
  // reported on their own.
  type SharedResourceInfo struct {
  	// ShareResources switches sharing on; AggregateCostData only consults
  	// IsSharedResource when it is true.
  	ShareResources bool
  	// SharedNamespace is the set of namespaces whose cost data are shared.
  	// Membership is decided by key presence, not by the stored value.
  	SharedNamespace map[string]bool
  	// LabelSelectors maps a label name to the set of values that mark a cost
  	// datum as shared.
  	LabelSelectors map[string]map[string]bool
  }
  // SharedCostInfo describes one additional shared cost. AggregateCostData adds
  // Cost, scaled by the shared coefficient, to every aggregation's SharedCost.
  type SharedCostInfo struct {
  	// Name and ShareType are not read by the aggregation code visible here.
  	// NOTE(review): presumably a display name and a split type — confirm.
  	Name string
  	Cost float64
  	ShareType string
  }
  121. func (s *SharedResourceInfo) IsSharedResource(costDatum *CostData) bool {
  122. // exists in a shared namespace
  123. if _, ok := s.SharedNamespace[costDatum.Namespace]; ok {
  124. return true
  125. }
  126. // has at least one shared label (OR, not AND in the case of multiple labels)
  127. for labelName, labelValues := range s.LabelSelectors {
  128. if val, ok := costDatum.Labels[labelName]; ok && labelValues[val] {
  129. return true
  130. }
  131. }
  132. return false
  133. }
  134. func NewSharedResourceInfo(shareResources bool, sharedNamespaces []string, labelNames []string, labelValues []string) *SharedResourceInfo {
  135. sr := &SharedResourceInfo{
  136. ShareResources: shareResources,
  137. SharedNamespace: make(map[string]bool),
  138. LabelSelectors: make(map[string]map[string]bool),
  139. }
  140. for _, ns := range sharedNamespaces {
  141. sr.SharedNamespace[strings.Trim(ns, " ")] = true
  142. }
  143. // Creating a map of label name to label value, but only if
  144. // the cardinality matches
  145. if len(labelNames) == len(labelValues) {
  146. for i := range labelNames {
  147. cleanedLname := SanitizeLabelName(strings.Trim(labelNames[i], " "))
  148. if values, ok := sr.LabelSelectors[cleanedLname]; ok {
  149. values[strings.Trim(labelValues[i], " ")] = true
  150. } else {
  151. sr.LabelSelectors[cleanedLname] = map[string]bool{strings.Trim(labelValues[i], " "): true}
  152. }
  153. }
  154. }
  155. return sr
  156. }
  157. func GetTotalContainerCost(costData map[string]*CostData, rate string, cp cloud.Provider, discount float64, customDiscount float64, idleCoefficients map[string]float64) float64 {
  158. totalContainerCost := 0.0
  159. for _, costDatum := range costData {
  160. clusterID := costDatum.ClusterID
  161. cpuv, ramv, gpuv, pvvs, netv := getPriceVectors(cp, costDatum, rate, discount, customDiscount, idleCoefficients[clusterID])
  162. totalContainerCost += totalVectors(cpuv)
  163. totalContainerCost += totalVectors(ramv)
  164. totalContainerCost += totalVectors(gpuv)
  165. for _, pv := range pvvs {
  166. totalContainerCost += totalVectors(pv)
  167. }
  168. totalContainerCost += totalVectors(netv)
  169. }
  170. return totalContainerCost
  171. }
  172. func (a *Accesses) ComputeIdleCoefficient(costData map[string]*CostData, cli prometheusClient.Client, cp cloud.Provider, discount float64, customDiscount float64, windowString, offset string) (map[string]float64, error) {
  173. coefficients := make(map[string]float64)
  174. profileName := "ComputeIdleCoefficient: ComputeClusterCosts"
  175. profileStart := time.Now()
  176. var clusterCosts map[string]*ClusterCosts
  177. var err error
  178. key := fmt.Sprintf("%s:%s", windowString, offset)
  179. if data, valid := a.ClusterCostsCache.Get(key); valid {
  180. clusterCosts = data.(map[string]*ClusterCosts)
  181. } else {
  182. clusterCosts, err = a.ComputeClusterCosts(cli, cp, windowString, offset, false)
  183. if err != nil {
  184. return nil, err
  185. }
  186. }
  187. measureTime(profileStart, profileThreshold, profileName)
  188. for cid, costs := range clusterCosts {
  189. if costs.CPUCumulative == 0 && costs.RAMCumulative == 0 && costs.StorageCumulative == 0 {
  190. klog.V(1).Infof("[Warning] No ClusterCosts data for cluster '%s'. Is it emitting data?", cid)
  191. coefficients[cid] = 1.0
  192. continue
  193. }
  194. if costs.TotalCumulative == 0 {
  195. return nil, fmt.Errorf("TotalCumulative cluster cost for cluster '%s' returned 0 over window '%s' offset '%s'", cid, windowString, offset)
  196. }
  197. totalContainerCost := 0.0
  198. for _, costDatum := range costData {
  199. if costDatum.ClusterID == cid {
  200. cpuv, ramv, gpuv, pvvs, _ := getPriceVectors(cp, costDatum, "", discount, customDiscount, 1)
  201. totalContainerCost += totalVectors(cpuv)
  202. totalContainerCost += totalVectors(ramv)
  203. totalContainerCost += totalVectors(gpuv)
  204. for _, pv := range pvvs {
  205. totalContainerCost += totalVectors(pv)
  206. }
  207. }
  208. }
  209. coeff := totalContainerCost / costs.TotalCumulative
  210. coefficients[cid] = coeff
  211. }
  212. return coefficients, nil
  213. }
  // AggregationOptions provides optional parameters to AggregateCostData, allowing callers to perform more complex operations
  type AggregationOptions struct {
  	Discount float64 // percent by which to discount CPU, RAM, and GPU cost
  	CustomDiscount float64 // additional custom discount applied to all prices
  	IdleCoefficients map[string]float64 // scales costs by amount of idle resources on a per-cluster basis; clusters without an entry use 1.0
  	IncludeEfficiency bool // set to true to receive efficiency/usage data
  	IncludeTimeSeries bool // set to true to receive time series data
  	Rate string // set to "hourly", "daily", or "monthly" to receive cost rate, rather than cumulative cost; empty means cumulative
  	// ResolutionHours is the number of hours each data point covers; values
  	// that are not positive fall back to 1.0.
  	ResolutionHours float64
  	// SharedResourceInfo selects the cost data whose cost is shared across
  	// the aggregations; nil disables resource sharing.
  	SharedResourceInfo *SharedResourceInfo
  	// SharedCosts are additional costs added to every aggregation's shared
  	// cost, scaled by the shared coefficient.
  	SharedCosts map[string]*SharedCostInfo
  	// FilteredContainerCount is not read by the aggregation code visible here.
  	// NOTE(review): presumably the number of containers filtered out
  	// upstream — confirm.
  	FilteredContainerCount int
  	// FilteredEnvironments: its length is added to the number of
  	// aggregations when shared costs are split evenly.
  	FilteredEnvironments map[string]int
  	// SharedSplit selects the split of shared costs: SplitTypeWeighted splits
  	// proportionally to cost, anything else splits evenly.
  	SharedSplit string
  	// TotalContainerCost is the total used as the basis of the weighted
  	// split (see GetTotalContainerCost).
  	TotalContainerCost float64
  }
  230. // Helper method to test request/usgae values against allocation averages for efficiency scores. Generate a warning log if
  231. // clamp is required
  232. func clampAverage(requestsAvg float64, usedAverage float64, allocationAvg float64, resource string) (float64, float64) {
  233. rAvg := requestsAvg
  234. if rAvg > allocationAvg {
  235. klog.V(4).Infof("[Warning] Average %s Requested (%f) > Average %s Allocated (%f). Clamping.", resource, rAvg, resource, allocationAvg)
  236. rAvg = allocationAvg
  237. }
  238. uAvg := usedAverage
  239. if uAvg > allocationAvg {
  240. klog.V(4).Infof("[Warning]: Average %s Used (%f) > Average %s Allocated (%f). Clamping.", resource, uAvg, resource, allocationAvg)
  241. uAvg = allocationAvg
  242. }
  243. return rAvg, uAvg
  244. }
  245. // AggregateCostData aggregates raw cost data by field; e.g. namespace, cluster, service, or label. In the case of label, callers
  246. // must pass a slice of subfields indicating the labels by which to group. Provider is used to define custom resource pricing.
  247. // See AggregationOptions for optional parameters.
  248. func AggregateCostData(costData map[string]*CostData, field string, subfields []string, cp cloud.Provider, opts *AggregationOptions) map[string]*Aggregation {
  249. discount := opts.Discount
  250. customDiscount := opts.CustomDiscount
  251. idleCoefficients := opts.IdleCoefficients
  252. includeTimeSeries := opts.IncludeTimeSeries
  253. includeEfficiency := opts.IncludeEfficiency
  254. rate := opts.Rate
  255. sr := opts.SharedResourceInfo
  256. resolutionHours := 1.0
  257. if opts.ResolutionHours > 0.0 {
  258. resolutionHours = opts.ResolutionHours
  259. }
  260. if idleCoefficients == nil {
  261. idleCoefficients = make(map[string]float64)
  262. }
  263. // aggregations collects key-value pairs of resource group-to-aggregated data
  264. // e.g. namespace-to-data or label-value-to-data
  265. aggregations := make(map[string]*Aggregation)
  266. // sharedResourceCost is the running total cost of resources that should be reported
  267. // as shared across all other resources, rather than reported as a stand-alone category
  268. sharedResourceCost := 0.0
  269. for _, costDatum := range costData {
  270. idleCoefficient, ok := idleCoefficients[costDatum.ClusterID]
  271. if !ok {
  272. idleCoefficient = 1.0
  273. }
  274. if sr != nil && sr.ShareResources && sr.IsSharedResource(costDatum) {
  275. cpuv, ramv, gpuv, pvvs, netv := getPriceVectors(cp, costDatum, rate, discount, customDiscount, idleCoefficient)
  276. sharedResourceCost += totalVectors(cpuv)
  277. sharedResourceCost += totalVectors(ramv)
  278. sharedResourceCost += totalVectors(gpuv)
  279. sharedResourceCost += totalVectors(netv)
  280. for _, pv := range pvvs {
  281. sharedResourceCost += totalVectors(pv)
  282. }
  283. } else {
  284. if field == "cluster" {
  285. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.ClusterID, discount, customDiscount, idleCoefficient, false)
  286. } else if field == "node" {
  287. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.NodeName, discount, customDiscount, idleCoefficient, false)
  288. } else if field == "namespace" {
  289. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.Namespace, discount, customDiscount, idleCoefficient, false)
  290. } else if field == "service" {
  291. if len(costDatum.Services) > 0 {
  292. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.Namespace+"/"+costDatum.Services[0], discount, customDiscount, idleCoefficient, false)
  293. } else {
  294. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, UnallocatedSubfield, discount, customDiscount, idleCoefficient, false)
  295. }
  296. } else if field == "deployment" {
  297. if len(costDatum.Deployments) > 0 {
  298. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.Namespace+"/"+costDatum.Deployments[0], discount, customDiscount, idleCoefficient, false)
  299. } else {
  300. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, UnallocatedSubfield, discount, customDiscount, idleCoefficient, false)
  301. }
  302. } else if field == "statefulset" {
  303. if len(costDatum.Statefulsets) > 0 {
  304. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.Namespace+"/"+costDatum.Statefulsets[0], discount, customDiscount, idleCoefficient, false)
  305. } else {
  306. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, UnallocatedSubfield, discount, customDiscount, idleCoefficient, false)
  307. }
  308. } else if field == "daemonset" {
  309. if len(costDatum.Daemonsets) > 0 {
  310. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.Namespace+"/"+costDatum.Daemonsets[0], discount, customDiscount, idleCoefficient, false)
  311. } else {
  312. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, UnallocatedSubfield, discount, customDiscount, idleCoefficient, false)
  313. }
  314. } else if field == "controller" {
  315. if controller, kind, hasController := costDatum.GetController(); hasController {
  316. key := fmt.Sprintf("%s/%s:%s", costDatum.Namespace, kind, controller)
  317. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, key, discount, customDiscount, idleCoefficient, false)
  318. } else {
  319. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, UnallocatedSubfield, discount, customDiscount, idleCoefficient, false)
  320. }
  321. } else if field == "label" {
  322. found := false
  323. if costDatum.Labels != nil {
  324. for _, sf := range subfields {
  325. if subfieldName, ok := costDatum.Labels[sf]; ok {
  326. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, subfieldName, discount, customDiscount, idleCoefficient, false)
  327. found = true
  328. break
  329. }
  330. }
  331. }
  332. if !found {
  333. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, UnallocatedSubfield, discount, customDiscount, idleCoefficient, false)
  334. }
  335. } else if field == "pod" {
  336. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, costDatum.Namespace+"/"+costDatum.PodName, discount, customDiscount, idleCoefficient, false)
  337. } else if field == "container" {
  338. key := fmt.Sprintf("%s/%s/%s/%s", costDatum.ClusterID, costDatum.Namespace, costDatum.PodName, costDatum.Name)
  339. aggregateDatum(cp, aggregations, costDatum, field, subfields, rate, key, discount, customDiscount, idleCoefficient, true)
  340. }
  341. }
  342. }
  343. for key, agg := range aggregations {
  344. sharedCoefficient := 1 / float64(len(opts.FilteredEnvironments)+len(aggregations))
  345. agg.CPUCost = totalVectors(agg.CPUCostVector)
  346. agg.RAMCost = totalVectors(agg.RAMCostVector)
  347. agg.GPUCost = totalVectors(agg.GPUCostVector)
  348. agg.PVCost = totalVectors(agg.PVCostVector)
  349. agg.NetworkCost = totalVectors(agg.NetworkCostVector)
  350. if opts.SharedSplit == SplitTypeWeighted {
  351. d := opts.TotalContainerCost - sharedResourceCost
  352. if d == 0 {
  353. klog.V(1).Infof("[Warning] Total container cost '%f' and shared resource cost '%f are the same'. Setting sharedCoefficient to 1", opts.TotalContainerCost, sharedResourceCost)
  354. sharedCoefficient = 1.0
  355. } else {
  356. sharedCoefficient = (agg.CPUCost + agg.RAMCost + agg.GPUCost + agg.PVCost + agg.NetworkCost) / d
  357. }
  358. }
  359. agg.SharedCost = sharedResourceCost * sharedCoefficient
  360. for _, v := range opts.SharedCosts {
  361. agg.SharedCost += v.Cost * sharedCoefficient
  362. }
  363. if rate != "" {
  364. rateCoeff := agg.RateCoefficient(rate, resolutionHours)
  365. agg.CPUCost *= rateCoeff
  366. agg.RAMCost *= rateCoeff
  367. agg.GPUCost *= rateCoeff
  368. agg.PVCost *= rateCoeff
  369. agg.NetworkCost *= rateCoeff
  370. agg.SharedCost *= rateCoeff
  371. }
  372. agg.TotalCost = agg.CPUCost + agg.RAMCost + agg.GPUCost + agg.PVCost + agg.NetworkCost + agg.SharedCost
  373. // Evicted and Completed Pods can still show up here, but have 0 cost.
  374. // Filter these by default. Any reason to keep them?
  375. if agg.TotalCost == 0 {
  376. delete(aggregations, key)
  377. continue
  378. }
  379. // CPU, RAM, and PV allocation are cumulative per-datum, whereas GPU is rate per-datum
  380. agg.CPUAllocationHourlyAverage = totalVectors(agg.CPUAllocationVectors) / agg.TotalHours(resolutionHours)
  381. agg.RAMAllocationHourlyAverage = totalVectors(agg.RAMAllocationVectors) / agg.TotalHours(resolutionHours)
  382. agg.GPUAllocationHourlyAverage = averageVectors(agg.GPUAllocationVectors)
  383. agg.PVAllocationHourlyAverage = totalVectors(agg.PVAllocationVectors) / agg.TotalHours(resolutionHours)
  384. // TODO niko/etl does this check out for GPU data? Do we need to rewrite GPU queries to be
  385. // culumative?
  386. agg.CPUAllocationTotal = totalVectors(agg.CPUAllocationVectors)
  387. agg.GPUAllocationTotal = totalVectors(agg.GPUAllocationVectors)
  388. agg.PVAllocationTotal = totalVectors(agg.PVAllocationVectors)
  389. agg.RAMAllocationTotal = totalVectors(agg.RAMAllocationVectors)
  390. if includeEfficiency {
  391. // Default both RAM and CPU to 0% efficiency so that a 0-requested, 0-allocated, 0-used situation
  392. // returns 0% efficiency, which should be a red-flag.
  393. //
  394. // If non-zero numbers are available, then efficiency is defined as:
  395. // idlePercentage = (requested - used) / allocated
  396. // efficiency = (1.0 - idlePercentage)
  397. //
  398. // It is possible to score > 100% efficiency, which is meant to be interpreted as a red flag.
  399. // It is not possible to score < 0% efficiency.
  400. agg.CPUEfficiency = 0.0
  401. CPUIdle := 0.0
  402. if agg.CPUAllocationHourlyAverage > 0.0 {
  403. avgCPURequested := averageVectors(agg.CPURequestedVectors)
  404. avgCPUUsed := averageVectors(agg.CPUUsedVectors)
  405. // Clamp averages, log range violations
  406. avgCPURequested, avgCPUUsed = clampAverage(avgCPURequested, avgCPUUsed, agg.CPUAllocationHourlyAverage, "CPU")
  407. CPUIdle = ((avgCPURequested - avgCPUUsed) / agg.CPUAllocationHourlyAverage)
  408. agg.CPUEfficiency = 1.0 - CPUIdle
  409. }
  410. agg.RAMEfficiency = 0.0
  411. RAMIdle := 0.0
  412. if agg.RAMAllocationHourlyAverage > 0.0 {
  413. avgRAMRequested := averageVectors(agg.RAMRequestedVectors)
  414. avgRAMUsed := averageVectors(agg.RAMUsedVectors)
  415. // Clamp averages, log range violations
  416. avgRAMRequested, avgRAMUsed = clampAverage(avgRAMRequested, avgRAMUsed, agg.RAMAllocationHourlyAverage, "RAM")
  417. RAMIdle = ((avgRAMRequested - avgRAMUsed) / agg.RAMAllocationHourlyAverage)
  418. agg.RAMEfficiency = 1.0 - RAMIdle
  419. }
  420. // Score total efficiency by the sum of CPU and RAM efficiency, weighted by their
  421. // respective total costs.
  422. agg.Efficiency = 0.0
  423. if (agg.CPUCost + agg.RAMCost) > 0 {
  424. agg.Efficiency = ((agg.CPUCost * agg.CPUEfficiency) + (agg.RAMCost * agg.RAMEfficiency)) / (agg.CPUCost + agg.RAMCost)
  425. }
  426. }
  427. // convert RAM from bytes to GiB
  428. agg.RAMAllocationHourlyAverage = agg.RAMAllocationHourlyAverage / 1024 / 1024 / 1024
  429. // convert storage from bytes to GiB
  430. agg.PVAllocationHourlyAverage = agg.PVAllocationHourlyAverage / 1024 / 1024 / 1024
  431. // remove time series data if it is not explicitly requested
  432. if !includeTimeSeries {
  433. agg.CPUCostVector = nil
  434. agg.RAMCostVector = nil
  435. agg.GPUCostVector = nil
  436. agg.PVCostVector = nil
  437. agg.NetworkCostVector = nil
  438. agg.TotalCostVector = nil
  439. } else { // otherwise compute a totalcostvector
  440. v1 := addVectors(agg.CPUCostVector, agg.RAMCostVector)
  441. v2 := addVectors(v1, agg.GPUCostVector)
  442. v3 := addVectors(v2, agg.PVCostVector)
  443. v4 := addVectors(v3, agg.NetworkCostVector)
  444. agg.TotalCostVector = v4
  445. }
  446. // Typesafety checks
  447. if math.IsNaN(agg.CPUAllocationHourlyAverage) || math.IsInf(agg.CPUAllocationHourlyAverage, 0) {
  448. klog.V(1).Infof("[Warning] CPUAllocationHourlyAverage is %f for '%s: %s/%s'", agg.CPUAllocationHourlyAverage, agg.Cluster, agg.Aggregator, agg.Environment)
  449. agg.CPUAllocationHourlyAverage = 0
  450. }
  451. if math.IsNaN(agg.CPUCost) || math.IsInf(agg.CPUCost, 0) {
  452. klog.V(1).Infof("[Warning] CPUCost is %f for '%s: %s/%s'", agg.CPUCost, agg.Cluster, agg.Aggregator, agg.Environment)
  453. agg.CPUCost = 0
  454. }
  455. if math.IsNaN(agg.CPUEfficiency) || math.IsInf(agg.CPUEfficiency, 0) {
  456. klog.V(1).Infof("[Warning] CPUEfficiency is %f for '%s: %s/%s'", agg.CPUEfficiency, agg.Cluster, agg.Aggregator, agg.Environment)
  457. agg.CPUEfficiency = 0
  458. }
  459. if math.IsNaN(agg.Efficiency) || math.IsInf(agg.Efficiency, 0) {
  460. klog.V(1).Infof("[Warning] Efficiency is %f for '%s: %s/%s'", agg.Efficiency, agg.Cluster, agg.Aggregator, agg.Environment)
  461. agg.Efficiency = 0
  462. }
  463. if math.IsNaN(agg.GPUAllocationHourlyAverage) || math.IsInf(agg.GPUAllocationHourlyAverage, 0) {
  464. klog.V(1).Infof("[Warning] GPUAllocationHourlyAverage is %f for '%s: %s/%s'", agg.GPUAllocationHourlyAverage, agg.Cluster, agg.Aggregator, agg.Environment)
  465. agg.GPUAllocationHourlyAverage = 0
  466. }
  467. if math.IsNaN(agg.GPUCost) || math.IsInf(agg.GPUCost, 0) {
  468. klog.V(1).Infof("[Warning] GPUCost is %f for '%s: %s/%s'", agg.GPUCost, agg.Cluster, agg.Aggregator, agg.Environment)
  469. agg.GPUCost = 0
  470. }
  471. if math.IsNaN(agg.RAMAllocationHourlyAverage) || math.IsInf(agg.RAMAllocationHourlyAverage, 0) {
  472. klog.V(1).Infof("[Warning] RAMAllocationHourlyAverage is %f for '%s: %s/%s'", agg.RAMAllocationHourlyAverage, agg.Cluster, agg.Aggregator, agg.Environment)
  473. agg.RAMAllocationHourlyAverage = 0
  474. }
  475. if math.IsNaN(agg.RAMCost) || math.IsInf(agg.RAMCost, 0) {
  476. klog.V(1).Infof("[Warning] RAMCost is %f for '%s: %s/%s'", agg.RAMCost, agg.Cluster, agg.Aggregator, agg.Environment)
  477. agg.RAMCost = 0
  478. }
  479. if math.IsNaN(agg.RAMEfficiency) || math.IsInf(agg.RAMEfficiency, 0) {
  480. klog.V(1).Infof("[Warning] RAMEfficiency is %f for '%s: %s/%s'", agg.RAMEfficiency, agg.Cluster, agg.Aggregator, agg.Environment)
  481. agg.RAMEfficiency = 0
  482. }
  483. if math.IsNaN(agg.PVAllocationHourlyAverage) || math.IsInf(agg.PVAllocationHourlyAverage, 0) {
  484. klog.V(1).Infof("[Warning] PVAllocationHourlyAverage is %f for '%s: %s/%s'", agg.PVAllocationHourlyAverage, agg.Cluster, agg.Aggregator, agg.Environment)
  485. agg.PVAllocationHourlyAverage = 0
  486. }
  487. if math.IsNaN(agg.PVCost) || math.IsInf(agg.PVCost, 0) {
  488. klog.V(1).Infof("[Warning] PVCost is %f for '%s: %s/%s'", agg.PVCost, agg.Cluster, agg.Aggregator, agg.Environment)
  489. agg.PVCost = 0
  490. }
  491. if math.IsNaN(agg.NetworkCost) || math.IsInf(agg.NetworkCost, 0) {
  492. klog.V(1).Infof("[Warning] NetworkCost is %f for '%s: %s/%s'", agg.NetworkCost, agg.Cluster, agg.Aggregator, agg.Environment)
  493. agg.NetworkCost = 0
  494. }
  495. if math.IsNaN(agg.SharedCost) || math.IsInf(agg.SharedCost, 0) {
  496. klog.V(1).Infof("[Warning] SharedCost is %f for '%s: %s/%s'", agg.SharedCost, agg.Cluster, agg.Aggregator, agg.Environment)
  497. agg.SharedCost = 0
  498. }
  499. if math.IsNaN(agg.TotalCost) || math.IsInf(agg.TotalCost, 0) {
  500. klog.V(1).Infof("[Warning] TotalCost is %f for '%s: %s/%s'", agg.TotalCost, agg.Cluster, agg.Aggregator, agg.Environment)
  501. agg.TotalCost = 0
  502. }
  503. }
  504. return aggregations
  505. }
  506. func aggregateDatum(cp cloud.Provider, aggregations map[string]*Aggregation, costDatum *CostData, field string, subfields []string, rate string, key string, discount float64, customDiscount float64, idleCoefficient float64, includeProperties bool) {
  507. // add new entry to aggregation results if a new key is encountered
  508. if _, ok := aggregations[key]; !ok {
  509. agg := &Aggregation{
  510. Aggregator: field,
  511. Environment: key,
  512. }
  513. if len(subfields) > 0 {
  514. agg.Subfields = subfields
  515. }
  516. if includeProperties {
  517. props := &kubecost.Properties{}
  518. props.SetCluster(costDatum.ClusterID)
  519. props.SetNode(costDatum.NodeName)
  520. if controller, kind, hasController := costDatum.GetController(); hasController {
  521. props.SetController(controller)
  522. props.SetControllerKind(kind)
  523. }
  524. props.SetLabels(costDatum.Labels)
  525. props.SetNamespace(costDatum.Namespace)
  526. props.SetPod(costDatum.PodName)
  527. props.SetServices(costDatum.Services)
  528. props.SetContainer(costDatum.Name)
  529. agg.Properties = props
  530. }
  531. aggregations[key] = agg
  532. }
  533. mergeVectors(cp, costDatum, aggregations[key], rate, discount, customDiscount, idleCoefficient)
  534. }
  535. func mergeVectors(cp cloud.Provider, costDatum *CostData, aggregation *Aggregation, rate string, discount float64, customDiscount float64, idleCoefficient float64) {
  536. aggregation.CPUAllocationVectors = addVectors(costDatum.CPUAllocation, aggregation.CPUAllocationVectors)
  537. aggregation.CPURequestedVectors = addVectors(costDatum.CPUReq, aggregation.CPURequestedVectors)
  538. aggregation.CPUUsedVectors = addVectors(costDatum.CPUUsed, aggregation.CPUUsedVectors)
  539. aggregation.RAMAllocationVectors = addVectors(costDatum.RAMAllocation, aggregation.RAMAllocationVectors)
  540. aggregation.RAMRequestedVectors = addVectors(costDatum.RAMReq, aggregation.RAMRequestedVectors)
  541. aggregation.RAMUsedVectors = addVectors(costDatum.RAMUsed, aggregation.RAMUsedVectors)
  542. aggregation.GPUAllocationVectors = addVectors(costDatum.GPUReq, aggregation.GPUAllocationVectors)
  543. for _, pvcd := range costDatum.PVCData {
  544. aggregation.PVAllocationVectors = addVectors(pvcd.Values, aggregation.PVAllocationVectors)
  545. }
  546. cpuv, ramv, gpuv, pvvs, netv := getPriceVectors(cp, costDatum, rate, discount, customDiscount, idleCoefficient)
  547. aggregation.CPUCostVector = addVectors(cpuv, aggregation.CPUCostVector)
  548. aggregation.RAMCostVector = addVectors(ramv, aggregation.RAMCostVector)
  549. aggregation.GPUCostVector = addVectors(gpuv, aggregation.GPUCostVector)
  550. aggregation.NetworkCostVector = addVectors(netv, aggregation.NetworkCostVector)
  551. for _, vectorList := range pvvs {
  552. aggregation.PVCostVector = addVectors(aggregation.PVCostVector, vectorList)
  553. }
  554. }
  555. // Returns the blended discounts applied to the node as a result of global discounts and reserved instance
  556. // discounts
  557. func getDiscounts(costDatum *CostData, cpuCost float64, ramCost float64, discount float64) (float64, float64) {
  558. if costDatum.NodeData == nil {
  559. return discount, discount
  560. }
  561. if costDatum.NodeData.IsSpot() {
  562. return 0, 0
  563. }
  564. reserved := costDatum.NodeData.Reserved
  565. // blended discounts
  566. blendedCPUDiscount := discount
  567. blendedRAMDiscount := discount
  568. if reserved != nil && reserved.CPUCost > 0 && reserved.RAMCost > 0 {
  569. reservedCPUDiscount := 0.0
  570. if cpuCost == 0 {
  571. klog.V(1).Infof("[Warning] No cpu cost found for cluster '%s' node '%s'", costDatum.ClusterID, costDatum.NodeName)
  572. } else {
  573. reservedCPUDiscount = 1.0 - (reserved.CPUCost / cpuCost)
  574. }
  575. reservedRAMDiscount := 0.0
  576. if ramCost == 0 {
  577. klog.V(1).Infof("[Warning] No ram cost found for cluster '%s' node '%s'", costDatum.ClusterID, costDatum.NodeName)
  578. } else {
  579. reservedRAMDiscount = 1.0 - (reserved.RAMCost / ramCost)
  580. }
  581. // AWS passes the # of reserved CPU and RAM as -1 to represent "All"
  582. if reserved.ReservedCPU < 0 && reserved.ReservedRAM < 0 {
  583. blendedCPUDiscount = reservedCPUDiscount
  584. blendedRAMDiscount = reservedRAMDiscount
  585. } else {
  586. nodeCPU, ierr := strconv.ParseInt(costDatum.NodeData.VCPU, 10, 64)
  587. nodeRAM, ferr := strconv.ParseFloat(costDatum.NodeData.RAMBytes, 64)
  588. if ierr == nil && ferr == nil {
  589. nodeRAMGB := nodeRAM / 1024 / 1024 / 1024
  590. reservedRAMGB := float64(reserved.ReservedRAM) / 1024 / 1024 / 1024
  591. nonReservedCPU := nodeCPU - reserved.ReservedCPU
  592. nonReservedRAM := nodeRAMGB - reservedRAMGB
  593. if nonReservedCPU == 0 {
  594. blendedCPUDiscount = reservedCPUDiscount
  595. } else {
  596. if nodeCPU == 0 {
  597. klog.V(1).Infof("[Warning] No ram found for cluster '%s' node '%s'", costDatum.ClusterID, costDatum.NodeName)
  598. } else {
  599. blendedCPUDiscount = (float64(reserved.ReservedCPU) * reservedCPUDiscount) + (float64(nonReservedCPU)*discount)/float64(nodeCPU)
  600. }
  601. }
  602. if nonReservedRAM == 0 {
  603. blendedRAMDiscount = reservedRAMDiscount
  604. } else {
  605. if nodeRAMGB == 0 {
  606. klog.V(1).Infof("[Warning] No ram found for cluster '%s' node '%s'", costDatum.ClusterID, costDatum.NodeName)
  607. } else {
  608. blendedRAMDiscount = (reservedRAMGB * reservedRAMDiscount) + (nonReservedRAM*discount)/nodeRAMGB
  609. }
  610. }
  611. }
  612. }
  613. }
  614. return blendedCPUDiscount, blendedRAMDiscount
  615. }
  616. func parseVectorPricing(cfg *cloud.CustomPricing, costDatum *CostData, cpuCostStr, ramCostStr, gpuCostStr, pvCostStr string) (float64, float64, float64, float64, bool) {
  617. usesCustom := false
  618. cpuCost, err := strconv.ParseFloat(cpuCostStr, 64)
  619. if err != nil || math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) || cpuCost == 0 {
  620. cpuCost, err = strconv.ParseFloat(cfg.CPU, 64)
  621. usesCustom = true
  622. if err != nil || math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) {
  623. cpuCost = 0
  624. }
  625. }
  626. ramCost, err := strconv.ParseFloat(ramCostStr, 64)
  627. if err != nil || math.IsNaN(ramCost) || math.IsInf(ramCost, 0) || ramCost == 0 {
  628. ramCost, err = strconv.ParseFloat(cfg.RAM, 64)
  629. usesCustom = true
  630. if err != nil || math.IsNaN(ramCost) || math.IsInf(ramCost, 0) {
  631. ramCost = 0
  632. }
  633. }
  634. gpuCost, err := strconv.ParseFloat(gpuCostStr, 64)
  635. if err != nil || math.IsNaN(gpuCost) || math.IsInf(gpuCost, 0) {
  636. gpuCost, err = strconv.ParseFloat(cfg.GPU, 64)
  637. if err != nil || math.IsNaN(gpuCost) || math.IsInf(gpuCost, 0) {
  638. gpuCost = 0
  639. }
  640. }
  641. pvCost, err := strconv.ParseFloat(pvCostStr, 64)
  642. if err != nil || math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) {
  643. pvCost, err = strconv.ParseFloat(cfg.Storage, 64)
  644. if err != nil || math.IsNaN(pvCost) || math.IsInf(pvCost, 0) {
  645. pvCost = 0
  646. }
  647. }
  648. return cpuCost, ramCost, gpuCost, pvCost, usesCustom
  649. }
  650. func getPriceVectors(cp cloud.Provider, costDatum *CostData, rate string, discount float64, customDiscount float64, idleCoefficient float64) ([]*util.Vector, []*util.Vector, []*util.Vector, [][]*util.Vector, []*util.Vector) {
  651. var cpuCost float64
  652. var ramCost float64
  653. var gpuCost float64
  654. var pvCost float64
  655. var usesCustom bool
  656. // If custom pricing is enabled and can be retrieved, replace
  657. // default cost values with custom values
  658. customPricing, err := cp.GetConfig()
  659. if err != nil {
  660. klog.Errorf("failed to load custom pricing: %s", err)
  661. }
  662. if cloud.CustomPricesEnabled(cp) && err == nil {
  663. var cpuCostStr string
  664. var ramCostStr string
  665. var gpuCostStr string
  666. var pvCostStr string
  667. if costDatum.NodeData.IsSpot() {
  668. cpuCostStr = customPricing.SpotCPU
  669. ramCostStr = customPricing.SpotRAM
  670. gpuCostStr = customPricing.SpotGPU
  671. } else {
  672. cpuCostStr = customPricing.CPU
  673. ramCostStr = customPricing.RAM
  674. gpuCostStr = customPricing.GPU
  675. }
  676. pvCostStr = customPricing.Storage
  677. cpuCost, ramCost, gpuCost, pvCost, usesCustom = parseVectorPricing(customPricing, costDatum, cpuCostStr, ramCostStr, gpuCostStr, pvCostStr)
  678. } else if costDatum.NodeData == nil && err == nil {
  679. cpuCostStr := customPricing.CPU
  680. ramCostStr := customPricing.RAM
  681. gpuCostStr := customPricing.GPU
  682. pvCostStr := customPricing.Storage
  683. cpuCost, ramCost, gpuCost, pvCost, usesCustom = parseVectorPricing(customPricing, costDatum, cpuCostStr, ramCostStr, gpuCostStr, pvCostStr)
  684. } else {
  685. cpuCostStr := costDatum.NodeData.VCPUCost
  686. ramCostStr := costDatum.NodeData.RAMCost
  687. gpuCostStr := costDatum.NodeData.GPUCost
  688. pvCostStr := costDatum.NodeData.StorageCost
  689. cpuCost, ramCost, gpuCost, pvCost, usesCustom = parseVectorPricing(customPricing, costDatum, cpuCostStr, ramCostStr, gpuCostStr, pvCostStr)
  690. }
  691. if usesCustom {
  692. log.DedupedWarningf(5, "No pricing data found for node `%s` , using custom pricing", costDatum.NodeName)
  693. }
  694. cpuDiscount, ramDiscount := getDiscounts(costDatum, cpuCost, ramCost, discount)
  695. klog.V(4).Infof("Node Name: %s", costDatum.NodeName)
  696. klog.V(4).Infof("Blended CPU Discount: %f", cpuDiscount)
  697. klog.V(4).Infof("Blended RAM Discount: %f", ramDiscount)
  698. // TODO should we try to apply the rate coefficient here or leave it as a totals-only metric?
  699. rateCoeff := 1.0
  700. if idleCoefficient == 0 {
  701. idleCoefficient = 1.0
  702. }
  703. cpuv := make([]*util.Vector, 0, len(costDatum.CPUAllocation))
  704. for _, val := range costDatum.CPUAllocation {
  705. cpuv = append(cpuv, &util.Vector{
  706. Timestamp: math.Round(val.Timestamp/10) * 10,
  707. Value: (val.Value * cpuCost * (1 - cpuDiscount) * (1 - customDiscount) / idleCoefficient) * rateCoeff,
  708. })
  709. }
  710. ramv := make([]*util.Vector, 0, len(costDatum.RAMAllocation))
  711. for _, val := range costDatum.RAMAllocation {
  712. ramv = append(ramv, &util.Vector{
  713. Timestamp: math.Round(val.Timestamp/10) * 10,
  714. Value: ((val.Value / 1024 / 1024 / 1024) * ramCost * (1 - ramDiscount) * (1 - customDiscount) / idleCoefficient) * rateCoeff,
  715. })
  716. }
  717. gpuv := make([]*util.Vector, 0, len(costDatum.GPUReq))
  718. for _, val := range costDatum.GPUReq {
  719. gpuv = append(gpuv, &util.Vector{
  720. Timestamp: math.Round(val.Timestamp/10) * 10,
  721. Value: (val.Value * gpuCost * (1 - discount) * (1 - customDiscount) / idleCoefficient) * rateCoeff,
  722. })
  723. }
  724. pvvs := make([][]*util.Vector, 0, len(costDatum.PVCData))
  725. for _, pvcData := range costDatum.PVCData {
  726. pvv := make([]*util.Vector, 0, len(pvcData.Values))
  727. if pvcData.Volume != nil {
  728. cost, _ := strconv.ParseFloat(pvcData.Volume.Cost, 64)
  729. // override with custom pricing if enabled
  730. if cloud.CustomPricesEnabled(cp) {
  731. cost = pvCost
  732. }
  733. for _, val := range pvcData.Values {
  734. pvv = append(pvv, &util.Vector{
  735. Timestamp: math.Round(val.Timestamp/10) * 10,
  736. Value: ((val.Value / 1024 / 1024 / 1024) * cost * (1 - customDiscount) / idleCoefficient) * rateCoeff,
  737. })
  738. }
  739. pvvs = append(pvvs, pvv)
  740. }
  741. }
  742. netv := make([]*util.Vector, 0, len(costDatum.NetworkData))
  743. for _, val := range costDatum.NetworkData {
  744. netv = append(netv, &util.Vector{
  745. Timestamp: math.Round(val.Timestamp/10) * 10,
  746. Value: val.Value,
  747. })
  748. }
  749. return cpuv, ramv, gpuv, pvvs, netv
  750. }
  751. func averageVectors(vectors []*util.Vector) float64 {
  752. if len(vectors) == 0 {
  753. return 0.0
  754. }
  755. return totalVectors(vectors) / float64(len(vectors))
  756. }
  757. func totalVectors(vectors []*util.Vector) float64 {
  758. total := 0.0
  759. for _, vector := range vectors {
  760. total += vector.Value
  761. }
  762. return total
  763. }
  764. // addVectors adds two slices of Vectors. Vector timestamps are rounded to the
  765. // nearest ten seconds to allow matching of Vectors within a delta allowance.
  766. // Matching Vectors are summed, while unmatched Vectors are passed through.
  767. // e.g. [(t=1, 1), (t=2, 2)] + [(t=2, 2), (t=3, 3)] = [(t=1, 1), (t=2, 4), (t=3, 3)]
  768. func addVectors(xvs []*util.Vector, yvs []*util.Vector) []*util.Vector {
  769. sumOp := func(result *util.Vector, x *float64, y *float64) bool {
  770. if x != nil && y != nil {
  771. result.Value = *x + *y
  772. } else if y != nil {
  773. result.Value = *y
  774. } else if x != nil {
  775. result.Value = *x
  776. }
  777. return true
  778. }
  779. return util.ApplyVectorOp(xvs, yvs, sumOp)
  780. }
  const (
  	// minCostDataLength is the minimum number of points a cost data time
  	// series must contain before the cost data is written to the cache.
  	minCostDataLength = 2
  )
  // EmptyDataError reports that no cost data was available for a requested
  // interval. It records the interval's duration, an optional offset, and an
  // optional underlying error.
  type EmptyDataError struct {
  	err      error
  	duration string
  	offset   string
  }

  // Error implements the error interface. The message always names the
  // duration; the offset and the underlying error are appended only when set.
  func (ede *EmptyDataError) Error() string {
  	msg := "empty data for range: " + ede.duration

  	if ede.offset != "" {
  		msg += " offset " + ede.offset
  	}

  	if ede.err != nil {
  		msg += fmt.Sprintf(": %s", ede.err)
  	}

  	return msg
  }
  802. func costDataTimeSeriesLength(costData map[string]*CostData) int {
  803. l := 0
  804. for _, cd := range costData {
  805. if l < len(cd.RAMAllocation) {
  806. l = len(cd.RAMAllocation)
  807. }
  808. if l < len(cd.CPUAllocation) {
  809. l = len(cd.CPUAllocation)
  810. }
  811. }
  812. return l
  813. }
  814. // ScaleHourlyCostData converts per-hour cost data to per-resolution data. If the target resolution is higher (i.e. < 1.0h)
  815. // then we can do simple multiplication by the fraction-of-an-hour and retain accuracy. If the target resolution is
  816. // lower (i.e. > 1.0h) then we sum groups of hourly data by resolution to maintain fidelity.
  817. // e.g. (100 hours of per-hour hourly data, resolutionHours=10) => 10 data points, grouped and summed by 10-hour window
  818. // e.g. (20 minutes of per-minute hourly data, resolutionHours=1/60) => 20 data points, scaled down by a factor of 60
  819. func ScaleHourlyCostData(data map[string]*CostData, resolutionHours float64) map[string]*CostData {
  820. scaled := map[string]*CostData{}
  821. for key, datum := range data {
  822. datum.RAMReq = scaleVectorSeries(datum.RAMReq, resolutionHours)
  823. datum.RAMUsed = scaleVectorSeries(datum.RAMUsed, resolutionHours)
  824. datum.RAMAllocation = scaleVectorSeries(datum.RAMAllocation, resolutionHours)
  825. datum.CPUReq = scaleVectorSeries(datum.CPUReq, resolutionHours)
  826. datum.CPUUsed = scaleVectorSeries(datum.CPUUsed, resolutionHours)
  827. datum.CPUAllocation = scaleVectorSeries(datum.CPUAllocation, resolutionHours)
  828. datum.GPUReq = scaleVectorSeries(datum.GPUReq, resolutionHours)
  829. datum.NetworkData = scaleVectorSeries(datum.NetworkData, resolutionHours)
  830. for _, pvcDatum := range datum.PVCData {
  831. pvcDatum.Values = scaleVectorSeries(pvcDatum.Values, resolutionHours)
  832. }
  833. scaled[key] = datum
  834. }
  835. return scaled
  836. }
  837. func scaleVectorSeries(vs []*util.Vector, resolutionHours float64) []*util.Vector {
  838. // if scaling to a lower resolution, compress the hourly data for maximum accuracy
  839. if resolutionHours > 1.0 {
  840. return compressVectorSeries(vs, resolutionHours)
  841. }
  842. // if scaling to a higher resolution, simply scale each value down by the fraction of an hour
  843. for _, v := range vs {
  844. v.Value *= resolutionHours
  845. }
  846. return vs
  847. }
  848. func compressVectorSeries(vs []*util.Vector, resolutionHours float64) []*util.Vector {
  849. if len(vs) == 0 {
  850. return vs
  851. }
  852. compressed := []*util.Vector{}
  853. threshold := float64(60 * 60 * resolutionHours)
  854. var acc *util.Vector
  855. for i, v := range vs {
  856. if acc == nil {
  857. // start a new accumulation from current datum
  858. acc = &util.Vector{
  859. Value: vs[i].Value,
  860. Timestamp: vs[i].Timestamp,
  861. }
  862. continue
  863. }
  864. if v.Timestamp-acc.Timestamp < threshold {
  865. // v should be accumulated in current datum
  866. acc.Value += v.Value
  867. } else {
  868. // v falls outside current datum's threshold; append and start a new one
  869. compressed = append(compressed, acc)
  870. acc = &util.Vector{
  871. Value: vs[i].Value,
  872. Timestamp: vs[i].Timestamp,
  873. }
  874. }
  875. }
  876. // append any remaining, incomplete accumulation
  877. if acc != nil {
  878. compressed = append(compressed, acc)
  879. }
  880. return compressed
  881. }
  882. // ComputeAggregateCostModel computes cost data for the given window, then aggregates it by the given fields.
  883. // Data is cached on two levels: the aggregation is cached as well as the underlying cost data.
  884. func (a *Accesses) ComputeAggregateCostModel(promClient prometheusClient.Client, duration, offset, field string, subfields []string, rate string, filters map[string]string,
  885. sri *SharedResourceInfo, shared string, allocateIdle, includeTimeSeries, includeEfficiency, disableCache, clearCache, noCache, noExpireCache, remoteEnabled, disableSharedOverhead bool) (map[string]*Aggregation, string, error) {
  886. profileBaseName := fmt.Sprintf("ComputeAggregateCostModel(duration=%s, offet=%s, field=%s)", duration, offset, field)
  887. defer measureTime(time.Now(), profileThreshold, profileBaseName)
  888. // parse cost data filters into FilterFuncs
  889. filterFuncs := []FilterFunc{}
  890. retainFuncs := []FilterFunc{}
  891. retainFuncs = append(retainFuncs, func(cd *CostData) (bool, string) {
  892. if sri != nil {
  893. return sri.IsSharedResource(cd), ""
  894. }
  895. return false, ""
  896. })
  897. aggregateEnvironment := func(costDatum *CostData) string {
  898. if field == "cluster" {
  899. return costDatum.ClusterID
  900. } else if field == "node" {
  901. return costDatum.NodeName
  902. } else if field == "namespace" {
  903. return costDatum.Namespace
  904. } else if field == "service" {
  905. if len(costDatum.Services) > 0 {
  906. return costDatum.Namespace + "/" + costDatum.Services[0]
  907. }
  908. } else if field == "deployment" {
  909. if len(costDatum.Deployments) > 0 {
  910. return costDatum.Namespace + "/" + costDatum.Deployments[0]
  911. }
  912. } else if field == "daemonset" {
  913. if len(costDatum.Daemonsets) > 0 {
  914. return costDatum.Namespace + "/" + costDatum.Daemonsets[0]
  915. }
  916. } else if field == "statefulset" {
  917. if len(costDatum.Statefulsets) > 0 {
  918. return costDatum.Namespace + "/" + costDatum.Statefulsets[0]
  919. }
  920. } else if field == "label" {
  921. if costDatum.Labels != nil {
  922. for _, sf := range subfields {
  923. if subfieldName, ok := costDatum.Labels[sf]; ok {
  924. return fmt.Sprintf("%s=%s", sf, subfieldName)
  925. }
  926. }
  927. }
  928. } else if field == "pod" {
  929. return costDatum.Namespace + "/" + costDatum.PodName
  930. } else if field == "container" {
  931. return costDatum.Namespace + "/" + costDatum.PodName + "/" + costDatum.Name
  932. }
  933. return ""
  934. }
  935. if filters["podprefix"] != "" {
  936. pps := []string{}
  937. for _, fp := range strings.Split(filters["podprefix"], ",") {
  938. if fp != "" {
  939. cleanedFilter := strings.TrimSpace(fp)
  940. pps = append(pps, cleanedFilter)
  941. }
  942. }
  943. filterFuncs = append(filterFuncs, func(cd *CostData) (bool, string) {
  944. aggEnv := aggregateEnvironment(cd)
  945. for _, pp := range pps {
  946. cleanedFilter := strings.TrimSpace(pp)
  947. if strings.HasPrefix(cd.PodName, cleanedFilter) {
  948. return true, aggEnv
  949. }
  950. }
  951. return false, aggEnv
  952. })
  953. }
  954. if filters["namespace"] != "" {
  955. // namespaces may be comma-separated, e.g. kubecost,default
  956. // multiple namespaces are evaluated as an OR relationship
  957. nss := strings.Split(filters["namespace"], ",")
  958. filterFuncs = append(filterFuncs, func(cd *CostData) (bool, string) {
  959. aggEnv := aggregateEnvironment(cd)
  960. for _, ns := range nss {
  961. nsTrim := strings.TrimSpace(ns)
  962. if cd.Namespace == nsTrim {
  963. return true, aggEnv
  964. } else if strings.HasSuffix(nsTrim, "*") { // trigger wildcard prefix filtering
  965. nsTrimAsterisk := strings.TrimSuffix(nsTrim, "*")
  966. if strings.HasPrefix(cd.Namespace, nsTrimAsterisk) {
  967. return true, aggEnv
  968. }
  969. }
  970. }
  971. return false, aggEnv
  972. })
  973. }
  974. if filters["node"] != "" {
  975. // nodes may be comma-separated, e.g. aws-node-1,aws-node-2
  976. // multiple nodes are evaluated as an OR relationship
  977. nodes := strings.Split(filters["node"], ",")
  978. filterFuncs = append(filterFuncs, func(cd *CostData) (bool, string) {
  979. aggEnv := aggregateEnvironment(cd)
  980. for _, node := range nodes {
  981. nodeTrim := strings.TrimSpace(node)
  982. if cd.NodeName == nodeTrim {
  983. return true, aggEnv
  984. } else if strings.HasSuffix(nodeTrim, "*") { // trigger wildcard prefix filtering
  985. nodeTrimAsterisk := strings.TrimSuffix(nodeTrim, "*")
  986. if strings.HasPrefix(cd.NodeName, nodeTrimAsterisk) {
  987. return true, aggEnv
  988. }
  989. }
  990. }
  991. return false, aggEnv
  992. })
  993. }
  994. if filters["cluster"] != "" {
  995. // clusters may be comma-separated, e.g. cluster-one,cluster-two
  996. // multiple clusters are evaluated as an OR relationship
  997. cs := strings.Split(filters["cluster"], ",")
  998. filterFuncs = append(filterFuncs, func(cd *CostData) (bool, string) {
  999. aggEnv := aggregateEnvironment(cd)
  1000. for _, c := range cs {
  1001. cTrim := strings.TrimSpace(c)
  1002. id, name := cd.ClusterID, cd.ClusterName
  1003. if id == cTrim || name == cTrim {
  1004. return true, aggEnv
  1005. } else if strings.HasSuffix(cTrim, "*") { // trigger wildcard prefix filtering
  1006. cTrimAsterisk := strings.TrimSuffix(cTrim, "*")
  1007. if strings.HasPrefix(id, cTrimAsterisk) || strings.HasPrefix(name, cTrimAsterisk) {
  1008. return true, aggEnv
  1009. }
  1010. }
  1011. }
  1012. return false, aggEnv
  1013. })
  1014. }
  1015. if filters["labels"] != "" {
  1016. // labels are expected to be comma-separated and to take the form key=value
  1017. // e.g. app=cost-analyzer,app.kubernetes.io/instance=kubecost
  1018. // each different label will be applied as an AND
  1019. // multiple values for a single label will be evaluated as an OR
  1020. labelValues := map[string][]string{}
  1021. ls := strings.Split(filters["labels"], ",")
  1022. for _, l := range ls {
  1023. lTrim := strings.TrimSpace(l)
  1024. label := strings.Split(lTrim, "=")
  1025. if len(label) == 2 {
  1026. ln := SanitizeLabelName(strings.TrimSpace(label[0]))
  1027. lv := strings.TrimSpace(label[1])
  1028. labelValues[ln] = append(labelValues[ln], lv)
  1029. } else {
  1030. // label is not of the form name=value, so log it and move on
  1031. klog.V(2).Infof("[Warning] aggregate cost model: skipping illegal label filter: %s", l)
  1032. }
  1033. }
  1034. // Generate FilterFunc for each set of label filters by invoking a function instead of accessing
  1035. // values by closure to prevent reference-type looping bug.
  1036. // (see https://github.com/golang/go/wiki/CommonMistakes#using-reference-to-loop-iterator-variable)
  1037. for label, values := range labelValues {
  1038. ff := (func(l string, vs []string) FilterFunc {
  1039. return func(cd *CostData) (bool, string) {
  1040. ae := aggregateEnvironment(cd)
  1041. for _, v := range vs {
  1042. if v == "__unallocated__" { // Special case. __unallocated__ means return all pods without the attached label
  1043. if _, ok := cd.Labels[label]; !ok {
  1044. return true, ae
  1045. }
  1046. }
  1047. if cd.Labels[label] == v {
  1048. return true, ae
  1049. } else if strings.HasSuffix(v, "*") { // trigger wildcard prefix filtering
  1050. vTrim := strings.TrimSuffix(v, "*")
  1051. if strings.HasPrefix(cd.Labels[label], vTrim) {
  1052. return true, ae
  1053. }
  1054. }
  1055. }
  1056. return false, ae
  1057. }
  1058. })(label, values)
  1059. filterFuncs = append(filterFuncs, ff)
  1060. }
  1061. }
  1062. // clear cache prior to checking the cache so that a clearCache=true
  1063. // request always returns a freshly computed value
  1064. if clearCache {
  1065. a.AggregateCache.Flush()
  1066. a.CostDataCache.Flush()
  1067. }
  1068. cacheExpiry := a.GetCacheExpiration(duration)
  1069. if noExpireCache {
  1070. cacheExpiry = cache.NoExpiration
  1071. }
  1072. // parametrize cache key by all request parameters
  1073. aggKey := GenerateAggKey(aggKeyParams{
  1074. duration: duration,
  1075. offset: offset,
  1076. filters: filters,
  1077. field: field,
  1078. subfields: subfields,
  1079. rate: rate,
  1080. sri: sri,
  1081. shareType: shared,
  1082. idle: allocateIdle,
  1083. timeSeries: includeTimeSeries,
  1084. efficiency: includeEfficiency,
  1085. })
  1086. // convert duration and offset to start and end times
  1087. startTime, endTime, err := ParseTimeRange(duration, offset)
  1088. if err != nil {
  1089. return nil, "", fmt.Errorf("Error parsing duration (%s) and offset (%s): %s", duration, offset, err)
  1090. }
  1091. durationHours := endTime.Sub(*startTime).Hours()
  1092. thanosOffset := time.Now().Add(-thanos.OffsetDuration())
  1093. if a.ThanosClient != nil && endTime.After(thanosOffset) {
  1094. klog.V(4).Infof("Setting end time backwards to first present data")
  1095. // Apply offsets to both end and start times to maintain correct time range
  1096. deltaDuration := endTime.Sub(thanosOffset)
  1097. *startTime = startTime.Add(-1 * deltaDuration)
  1098. *endTime = time.Now().Add(-thanos.OffsetDuration())
  1099. }
  1100. // determine resolution by size of duration
  1101. resolutionHours := durationHours
  1102. if durationHours >= 2160 {
  1103. // 90 days
  1104. resolutionHours = 72.0
  1105. } else if durationHours >= 720 {
  1106. // 30 days
  1107. resolutionHours = 24.0
  1108. } else if durationHours >= 168 {
  1109. // 7 days
  1110. resolutionHours = 24.0
  1111. } else if durationHours >= 48 {
  1112. // 2 days
  1113. resolutionHours = 2.0
  1114. } else if durationHours >= 1 {
  1115. resolutionHours = 1.0
  1116. }
  1117. key := fmt.Sprintf(`%s:%s:%fh:%t`, duration, offset, resolutionHours, remoteEnabled)
  1118. // report message about which of the two caches hit. by default report a miss
  1119. cacheMessage := fmt.Sprintf("L1 cache miss: %s L2 cache miss: %s", aggKey, key)
  1120. // check the cache for aggregated response; if cache is hit and not disabled, return response
  1121. if value, found := a.AggregateCache.Get(aggKey); found && !disableCache && !noCache {
  1122. result, ok := value.(map[string]*Aggregation)
  1123. if !ok {
  1124. // disable cache and recompute if type cast fails
  1125. klog.Errorf("caching error: failed to cast aggregate data to struct: %s", aggKey)
  1126. return a.ComputeAggregateCostModel(promClient, duration, offset, field, subfields, rate, filters,
  1127. sri, shared, allocateIdle, includeTimeSeries, includeEfficiency, true, false, noExpireCache, noCache, remoteEnabled, disableSharedOverhead)
  1128. }
  1129. return result, fmt.Sprintf("aggregate cache hit: %s", aggKey), nil
  1130. }
  1131. profileStart := time.Now()
  1132. profileName := profileBaseName + ": "
  1133. window := duration
  1134. if durationHours >= 1 {
  1135. window = fmt.Sprintf("%dh", int(resolutionHours))
  1136. // exclude the last window of the time frame to match Prometheus definitions of range, offset, and resolution
  1137. *startTime = startTime.Add(time.Duration(resolutionHours) * time.Hour)
  1138. } else {
  1139. // don't cache requests for durations of less than one hour
  1140. klog.Infof("key %s has durationhours %f", key, durationHours)
  1141. disableCache = true
  1142. }
  1143. profileBaseName = fmt.Sprintf("ComputeAggregateCostModel(duration=%s, offset=%s, field=%s, window=%s)", duration, offset, field, window)
  1144. // attempt to retrieve cost data from cache
  1145. var costData map[string]*CostData
  1146. cacheData, found := a.CostDataCache.Get(key)
  1147. if found && !disableCache && !noCache {
  1148. profileName += "get cost data from cache"
  1149. ok := false
  1150. costData, ok = cacheData.(map[string]*CostData)
  1151. cacheMessage = fmt.Sprintf("L1 cache miss: %s, L2 cost data cache hit: %s", aggKey, key)
  1152. if !ok {
  1153. klog.Errorf("caching error: failed to cast cost data to struct: %s", key)
  1154. }
  1155. } else {
  1156. klog.Infof("key %s missed cache. found %t, disableCache %t, noCache %t ", key, found, disableCache, noCache)
  1157. cv := a.CostDataCache.Items()
  1158. klog.V(3).Infof("Logging cache items...")
  1159. for k := range cv {
  1160. klog.V(3).Infof("Cache item: %s", k)
  1161. }
  1162. profileName += "compute cost data"
  1163. start := startTime.Format(RFC3339Milli)
  1164. end := endTime.Format(RFC3339Milli)
  1165. costData, err = a.Model.ComputeCostDataRange(promClient, a.KubeClientSet, a.CloudProvider, start, end, window, resolutionHours, "", "", remoteEnabled, offset)
  1166. if err != nil {
  1167. if pce, ok := err.(prom.CommError); ok {
  1168. return nil, "", pce
  1169. }
  1170. if strings.Contains(err.Error(), "data is empty") {
  1171. return nil, "", &EmptyDataError{err: err, duration: duration, offset: offset}
  1172. }
  1173. return nil, "", err
  1174. }
  1175. // compute length of the time series in the cost data and only compute
  1176. // aggregates and cache if the length is sufficiently high
  1177. costDataLen := costDataTimeSeriesLength(costData)
  1178. if durationHours < 1.0 {
  1179. // scale hourly cost data down to fractional hour
  1180. costData = ScaleHourlyCostData(costData, resolutionHours)
  1181. }
  1182. if costDataLen == 0 {
  1183. return nil, "", &EmptyDataError{duration: duration, offset: offset}
  1184. }
  1185. if costDataLen >= minCostDataLength && !noCache {
  1186. klog.Infof("Setting L2 cache: %s", key)
  1187. a.CostDataCache.Set(key, costData, cacheExpiry)
  1188. }
  1189. }
  1190. measureTime(profileStart, profileThreshold, profileName)
  1191. c, err := a.CloudProvider.GetConfig()
  1192. if err != nil {
  1193. return nil, "", err
  1194. }
  1195. discount, err := ParsePercentString(c.Discount)
  1196. if err != nil {
  1197. return nil, "", err
  1198. }
  1199. customDiscount, err := ParsePercentString(c.NegotiatedDiscount)
  1200. if err != nil {
  1201. return nil, "", err
  1202. }
  1203. sc := make(map[string]*SharedCostInfo)
  1204. if !disableSharedOverhead {
  1205. for key, val := range c.SharedCosts {
  1206. cost, err := strconv.ParseFloat(val, 64)
  1207. durationCoefficient := durationHours / util.HoursPerMonth
  1208. if err != nil {
  1209. return nil, "", fmt.Errorf("Unable to parse shared cost %s: %s", val, err.Error())
  1210. }
  1211. sc[key] = &SharedCostInfo{
  1212. Name: key,
  1213. Cost: cost * durationCoefficient,
  1214. }
  1215. }
  1216. }
  1217. profileStart = time.Now()
  1218. profileName = profileBaseName + ": compute idle coefficient"
  1219. idleCoefficients := make(map[string]float64)
  1220. if allocateIdle {
  1221. idleDurationCalcHours := durationHours
  1222. if durationHours < 1 {
  1223. idleDurationCalcHours = 1
  1224. }
  1225. windowStr := fmt.Sprintf("%dh", int(idleDurationCalcHours))
  1226. if a.ThanosClient != nil {
  1227. offset = thanos.Offset()
  1228. klog.Infof("Setting offset to %s", offset)
  1229. }
  1230. idleCoefficients, err = a.ComputeIdleCoefficient(costData, promClient, a.CloudProvider, discount, customDiscount, windowStr, offset)
  1231. if err != nil {
  1232. klog.Errorf("error computing idle coefficient: windowString=%s, offset=%s, err=%s", windowStr, offset, err)
  1233. return nil, "", err
  1234. }
  1235. }
  1236. for cid, idleCoefficient := range idleCoefficients {
  1237. klog.Infof("Idle Coeff: %s: %f", cid, idleCoefficient)
  1238. }
  1239. totalContainerCost := 0.0
  1240. if shared == SplitTypeWeighted {
  1241. totalContainerCost = GetTotalContainerCost(costData, rate, a.CloudProvider, discount, customDiscount, idleCoefficients)
  1242. }
  1243. measureTime(profileStart, profileThreshold, profileName)
  1244. profileStart = time.Now()
  1245. profileName = profileBaseName + ": filter cost data"
  1246. // filter cost data by namespace and cluster after caching for maximal cache hits
  1247. costData, filteredContainerCount, filteredEnvironments := FilterCostData(costData, retainFuncs, filterFuncs)
  1248. measureTime(profileStart, profileThreshold, profileName)
  1249. profileStart = time.Now()
  1250. profileName = profileBaseName + ": aggregate cost data"
  1251. // aggregate cost model data by given fields and cache the result for the default expiration
  1252. opts := &AggregationOptions{
  1253. Discount: discount,
  1254. CustomDiscount: customDiscount,
  1255. IdleCoefficients: idleCoefficients,
  1256. IncludeEfficiency: includeEfficiency,
  1257. IncludeTimeSeries: includeTimeSeries,
  1258. Rate: rate,
  1259. ResolutionHours: resolutionHours,
  1260. SharedResourceInfo: sri,
  1261. SharedCosts: sc,
  1262. FilteredContainerCount: filteredContainerCount,
  1263. FilteredEnvironments: filteredEnvironments,
  1264. TotalContainerCost: totalContainerCost,
  1265. SharedSplit: shared,
  1266. }
  1267. result := AggregateCostData(costData, field, subfields, a.CloudProvider, opts)
  1268. // If sending time series data back, switch scale back to hourly data. At this point,
  1269. // resolutionHours may have converted our hourly data to more- or less-than hourly data.
  1270. if includeTimeSeries {
  1271. for _, aggs := range result {
  1272. ScaleAggregationTimeSeries(aggs, resolutionHours)
  1273. }
  1274. }
  1275. // compute length of the time series in the cost data and only cache
  1276. // aggregation results if the length is sufficiently high
  1277. costDataLen := costDataTimeSeriesLength(costData)
  1278. if costDataLen >= minCostDataLength && durationHours > 1 && !noCache {
  1279. // Set the result map (rather than a pointer to it) because map is a reference type
  1280. klog.Infof("Caching key in aggregate cache: %s", key)
  1281. a.AggregateCache.Set(aggKey, result, cacheExpiry)
  1282. } else {
  1283. klog.Infof("Not caching for key %s. Not enough data: %t, Duration less than 1h: %t, noCache: %t", key, costDataLen < minCostDataLength, durationHours < 1, noCache)
  1284. }
  1285. measureTime(profileStart, profileThreshold, profileName)
  1286. return result, cacheMessage, nil
  1287. }
  1288. // ScaleAggregationTimeSeries reverses the scaling done by ScaleHourlyCostData, returning
  1289. // the aggregation's time series to hourly data.
  1290. func ScaleAggregationTimeSeries(aggregation *Aggregation, resolutionHours float64) {
  1291. for _, v := range aggregation.CPUCostVector {
  1292. v.Value /= resolutionHours
  1293. }
  1294. for _, v := range aggregation.GPUCostVector {
  1295. v.Value /= resolutionHours
  1296. }
  1297. for _, v := range aggregation.RAMCostVector {
  1298. v.Value /= resolutionHours
  1299. }
  1300. for _, v := range aggregation.PVCostVector {
  1301. v.Value /= resolutionHours
  1302. }
  1303. for _, v := range aggregation.NetworkCostVector {
  1304. v.Value /= resolutionHours
  1305. }
  1306. for _, v := range aggregation.TotalCostVector {
  1307. v.Value /= resolutionHours
  1308. }
  1309. return
  1310. }
  1311. // String returns a string representation of the encapsulated shared resources, which
  1312. // can be used to uniquely identify a set of shared resources. Sorting sets of shared
  1313. // resources ensures that strings representing permutations of the same combination match.
  1314. func (s *SharedResourceInfo) String() string {
  1315. if s == nil {
  1316. return ""
  1317. }
  1318. nss := []string{}
  1319. for ns := range s.SharedNamespace {
  1320. nss = append(nss, ns)
  1321. }
  1322. sort.Strings(nss)
  1323. nsStr := strings.Join(nss, ",")
  1324. labels := []string{}
  1325. for lbl, vals := range s.LabelSelectors {
  1326. for val := range vals {
  1327. if lbl != "" && val != "" {
  1328. labels = append(labels, fmt.Sprintf("%s=%s", lbl, val))
  1329. }
  1330. }
  1331. }
  1332. sort.Strings(labels)
  1333. labelStr := strings.Join(labels, ",")
  1334. return fmt.Sprintf("%s:%s", nsStr, labelStr)
  1335. }
  1336. type aggKeyParams struct {
  1337. duration string
  1338. offset string
  1339. filters map[string]string
  1340. field string
  1341. subfields []string
  1342. rate string
  1343. sri *SharedResourceInfo
  1344. shareType string
  1345. idle bool
  1346. timeSeries bool
  1347. efficiency bool
  1348. }
  1349. // GenerateAggKey generates a parameter-unique key for caching the aggregate cost model
  1350. func GenerateAggKey(ps aggKeyParams) string {
  1351. // parse, trim, and sort podprefix filters
  1352. podPrefixFilters := []string{}
  1353. if ppfs, ok := ps.filters["podprefix"]; ok && ppfs != "" {
  1354. for _, psf := range strings.Split(ppfs, ",") {
  1355. podPrefixFilters = append(podPrefixFilters, strings.TrimSpace(psf))
  1356. }
  1357. }
  1358. sort.Strings(podPrefixFilters)
  1359. podPrefixFiltersStr := strings.Join(podPrefixFilters, ",")
  1360. // parse, trim, and sort namespace filters
  1361. nsFilters := []string{}
  1362. if nsfs, ok := ps.filters["namespace"]; ok && nsfs != "" {
  1363. for _, nsf := range strings.Split(nsfs, ",") {
  1364. nsFilters = append(nsFilters, strings.TrimSpace(nsf))
  1365. }
  1366. }
  1367. sort.Strings(nsFilters)
  1368. nsFilterStr := strings.Join(nsFilters, ",")
  1369. // parse, trim, and sort node filters
  1370. nodeFilters := []string{}
  1371. if nodefs, ok := ps.filters["node"]; ok && nodefs != "" {
  1372. for _, nodef := range strings.Split(nodefs, ",") {
  1373. nodeFilters = append(nodeFilters, strings.TrimSpace(nodef))
  1374. }
  1375. }
  1376. sort.Strings(nodeFilters)
  1377. nodeFilterStr := strings.Join(nodeFilters, ",")
  1378. // parse, trim, and sort cluster filters
  1379. cFilters := []string{}
  1380. if cfs, ok := ps.filters["cluster"]; ok && cfs != "" {
  1381. for _, cf := range strings.Split(cfs, ",") {
  1382. cFilters = append(cFilters, strings.TrimSpace(cf))
  1383. }
  1384. }
  1385. sort.Strings(cFilters)
  1386. cFilterStr := strings.Join(cFilters, ",")
  1387. // parse, trim, and sort label filters
  1388. lFilters := []string{}
  1389. if lfs, ok := ps.filters["labels"]; ok && lfs != "" {
  1390. for _, lf := range strings.Split(lfs, ",") {
  1391. // trim whitespace from the label name and the label value
  1392. // of each label name/value pair, then reconstruct
  1393. // e.g. "tier = frontend, app = kubecost" == "app=kubecost,tier=frontend"
  1394. lfa := strings.Split(lf, "=")
  1395. if len(lfa) == 2 {
  1396. lfn := strings.TrimSpace(lfa[0])
  1397. lfv := strings.TrimSpace(lfa[1])
  1398. lFilters = append(lFilters, fmt.Sprintf("%s=%s", lfn, lfv))
  1399. } else {
  1400. // label is not of the form name=value, so log it and move on
  1401. klog.V(2).Infof("[Warning] GenerateAggKey: skipping illegal label filter: %s", lf)
  1402. }
  1403. }
  1404. }
  1405. sort.Strings(lFilters)
  1406. lFilterStr := strings.Join(lFilters, ",")
  1407. filterStr := fmt.Sprintf("%s:%s:%s:%s:%s", nsFilterStr, nodeFilterStr, cFilterStr, lFilterStr, podPrefixFiltersStr)
  1408. sort.Strings(ps.subfields)
  1409. fieldStr := fmt.Sprintf("%s:%s", ps.field, strings.Join(ps.subfields, ","))
  1410. return fmt.Sprintf("%s:%s:%s:%s:%s:%s:%s:%t:%t:%t", ps.duration, ps.offset, filterStr, fieldStr, ps.rate,
  1411. ps.sri, ps.shareType, ps.idle, ps.timeSeries, ps.efficiency)
  1412. }
  1413. // AggregateCostModelHandler handles requests to the aggregated cost model API. See
  1414. // ComputeAggregateCostModel for details.
  1415. func (a *Accesses) AggregateCostModelHandler(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
  1416. w.Header().Set("Content-Type", "application/json")
  1417. windowStr := r.URL.Query().Get("window")
  1418. // Convert UTC-RFC3339 pairs to configured UTC offset
  1419. // e.g. with UTC offset of -0600, 2020-07-01T00:00:00Z becomes
  1420. // 2020-07-01T06:00:00Z == 2020-07-01T00:00:00-0600
  1421. // TODO niko/etl fix the frontend because this is confusing if you're
  1422. // actually asking for UTC time (...Z) and we swap that "Z" out for the
  1423. // configured UTC offset without asking
  1424. rfc3339 := `\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ`
  1425. regex := regexp.MustCompile(fmt.Sprintf(`(%s),(%s)`, rfc3339, rfc3339))
  1426. match := regex.FindStringSubmatch(windowStr)
  1427. if match != nil {
  1428. start, _ := time.Parse(time.RFC3339, match[1])
  1429. start = start.Add(-env.GetParsedUTCOffset()).In(time.UTC)
  1430. end, _ := time.Parse(time.RFC3339, match[2])
  1431. end = end.Add(-env.GetParsedUTCOffset()).In(time.UTC)
  1432. windowStr = fmt.Sprintf("%sZ,%sZ", start.Format("2006-01-02T15:04:05"), end.Format("2006-01-02T15:04:05Z"))
  1433. }
  1434. // determine duration and offset from query parameters
  1435. window, err := kubecost.ParseWindowWithOffset(windowStr, env.GetParsedUTCOffset())
  1436. if err != nil || window.Start() == nil {
  1437. http.Error(w, fmt.Sprintf("invalid window: %s", err), http.StatusBadRequest)
  1438. return
  1439. }
  1440. duration, offset := window.ToDurationOffset()
  1441. durRegex := regexp.MustCompile(`^(\d+)(m|h|d|s)$`)
  1442. isDurationStr := durRegex.MatchString(windowStr)
  1443. // legacy offset option should override window offset
  1444. if r.URL.Query().Get("offset") != "" {
  1445. offset = r.URL.Query().Get("offset")
  1446. // Shift window by offset, but only when manually set with separate
  1447. // parameter and window was provided as a duration string. Otherwise,
  1448. // do not alter the (duration, offset) from ParseWindowWithOffset.
  1449. if offset != "1m" && isDurationStr {
  1450. match := durRegex.FindStringSubmatch(offset)
  1451. if match != nil && len(match) == 3 {
  1452. dur := time.Minute
  1453. if match[2] == "h" {
  1454. dur = time.Hour
  1455. }
  1456. if match[2] == "d" {
  1457. dur = 24 * time.Hour
  1458. }
  1459. if match[2] == "s" {
  1460. dur = time.Second
  1461. }
  1462. num, _ := strconv.ParseInt(match[1], 10, 64)
  1463. window = window.Shift(-time.Duration(num) * dur)
  1464. }
  1465. }
  1466. }
  1467. // redirect requests with no offset to a 1m offset to improve cache hits
  1468. if offset == "" {
  1469. offset = "1m"
  1470. }
  1471. // parse remaining query parameters
  1472. namespace := r.URL.Query().Get("namespace")
  1473. cluster := r.URL.Query().Get("cluster")
  1474. labels := r.URL.Query().Get("labels")
  1475. podprefix := r.URL.Query().Get("podprefix")
  1476. labelArray := strings.Split(labels, "=")
  1477. labelArray[0] = strings.ReplaceAll(labelArray[0], "-", "_")
  1478. labels = strings.Join(labelArray, "=")
  1479. field := r.URL.Query().Get("aggregation")
  1480. subfieldStr := r.URL.Query().Get("aggregationSubfield")
  1481. rate := r.URL.Query().Get("rate")
  1482. idleFlag := r.URL.Query().Get("allocateIdle")
  1483. sharedNamespaces := r.URL.Query().Get("sharedNamespaces")
  1484. sharedLabelNames := r.URL.Query().Get("sharedLabelNames")
  1485. sharedLabelValues := r.URL.Query().Get("sharedLabelValues")
  1486. remote := r.URL.Query().Get("remote") != "false"
  1487. shared := r.URL.Query().Get("sharedSplit")
  1488. subfields := []string{}
  1489. if len(subfieldStr) > 0 {
  1490. s := strings.Split(r.URL.Query().Get("aggregationSubfield"), ",")
  1491. for _, rawLabel := range s {
  1492. subfields = append(subfields, SanitizeLabelName(rawLabel))
  1493. }
  1494. }
  1495. var allocateIdle bool
  1496. if idleFlag == "default" {
  1497. c, _ := a.CloudProvider.GetConfig()
  1498. allocateIdle = (c.DefaultIdle == "true")
  1499. } else {
  1500. allocateIdle = (idleFlag == "true")
  1501. }
  1502. // timeSeries == true maintains the time series dimension of the data,
  1503. // which by default gets summed over the entire interval
  1504. includeTimeSeries := r.URL.Query().Get("timeSeries") == "true"
  1505. // efficiency == true aggregates and returns usage and efficiency data
  1506. // includeEfficiency := r.URL.Query().Get("efficiency") == "true"
  1507. // efficiency has been deprecated in favor of a default to always send efficiency
  1508. includeEfficiency := true
  1509. // TODO niko/caching rename "recomputeCache"
  1510. // disableCache, if set to "true", tells this function to recompute and
  1511. // cache the requested data
  1512. disableCache := r.URL.Query().Get("disableCache") == "true"
  1513. // clearCache, if set to "true", tells this function to flush the cache,
  1514. // then recompute and cache the requested data
  1515. clearCache := r.URL.Query().Get("clearCache") == "true"
  1516. // noCache avoids the cache altogether, both reading from and writing to
  1517. noCache := r.URL.Query().Get("noCache") == "true"
  1518. // noExpireCache should only be used by cache warming to set non-expiring caches
  1519. noExpireCache := false
  1520. // aggregation field is required
  1521. if field == "" {
  1522. http.Error(w, "Missing aggregation field parameter", http.StatusBadRequest)
  1523. return
  1524. }
  1525. // aggregation subfield is required when aggregation field is "label"
  1526. if field == "label" && len(subfields) == 0 {
  1527. http.Error(w, "Missing aggregation subfield parameter for aggregation by label", http.StatusBadRequest)
  1528. return
  1529. }
  1530. // enforce one of four available rate options
  1531. if rate != "" && rate != "hourly" && rate != "daily" && rate != "monthly" {
  1532. http.Error(w, "If set, rate parameter must be one of: 'hourly', 'daily', 'monthly'", http.StatusBadRequest)
  1533. return
  1534. }
  1535. // parse cost data filters
  1536. // namespace and cluster are exact-string-matches
  1537. // labels are expected to be comma-separated and to take the form key=value
  1538. // e.g. app=cost-analyzer,app.kubernetes.io/instance=kubecost
  1539. filters := map[string]string{
  1540. "namespace": namespace,
  1541. "cluster": cluster,
  1542. "labels": labels,
  1543. "podprefix": podprefix,
  1544. }
  1545. // parse shared resources
  1546. sn := []string{}
  1547. sln := []string{}
  1548. slv := []string{}
  1549. if sharedNamespaces != "" {
  1550. sn = strings.Split(sharedNamespaces, ",")
  1551. }
  1552. if sharedLabelNames != "" {
  1553. sln = strings.Split(sharedLabelNames, ",")
  1554. slv = strings.Split(sharedLabelValues, ",")
  1555. if len(sln) != len(slv) || slv[0] == "" {
  1556. http.Error(w, "Supply exacly one shared label value per shared label name", http.StatusBadRequest)
  1557. return
  1558. }
  1559. }
  1560. var sr *SharedResourceInfo
  1561. if len(sn) > 0 || len(sln) > 0 {
  1562. sr = NewSharedResourceInfo(true, sn, sln, slv)
  1563. }
  1564. // enable remote if it is available and not disabled
  1565. remoteEnabled := remote && env.IsRemoteEnabled()
  1566. // if custom pricing has changed, then clear the cache and recompute data
  1567. if a.CustomPricingHasChanged() {
  1568. clearCache = true
  1569. }
  1570. promClient := a.GetPrometheusClient(remote)
  1571. var data map[string]*Aggregation
  1572. var message string
  1573. // etlEnabled := env.IsETLEnabled()
  1574. // useETLAdapter := r.URL.Query().Get("etl") == "true"
  1575. // if etlEnabled && useETLAdapter {
  1576. // data, message, err = a.AdaptETLAggregateCostModel(window, field, subfields, rate, filters, sr, shared, allocateIdle, includeTimeSeries)
  1577. // } else {
  1578. // data, message, err = a.ComputeAggregateCostModel(promClient, duration, offset, field, subfields, rate, filters,
  1579. // sr, shared, allocateIdle, includeTimeSeries, includeEfficiency, disableCache, clearCache, noCache, noExpireCache, remoteEnabled, false)
  1580. // }
  1581. data, message, err = a.ComputeAggregateCostModel(promClient, duration, offset, field, subfields, rate, filters,
  1582. sr, shared, allocateIdle, includeTimeSeries, includeEfficiency, disableCache, clearCache, noCache, noExpireCache, remoteEnabled, false)
  1583. // Find any warnings in http request context
  1584. warning, _ := product.GetWarning(r)
  1585. if err != nil {
  1586. if emptyErr, ok := err.(*EmptyDataError); ok {
  1587. if warning == "" {
  1588. w.Write(WrapData(map[string]interface{}{}, emptyErr))
  1589. } else {
  1590. w.Write(WrapDataWithWarning(map[string]interface{}{}, emptyErr, warning))
  1591. }
  1592. return
  1593. }
  1594. if boundaryErr, ok := err.(*kubecost.BoundaryError); ok {
  1595. if window.Start() != nil && window.Start().After(time.Now().Add(-90*24*time.Hour)) {
  1596. // Asking for data within a 90 day period: it will be available
  1597. // after the pipeline builds
  1598. msg := "Data will be available after ETL is built"
  1599. rex := regexp.MustCompile(`(\d+\.*\d*)%`)
  1600. match := rex.FindStringSubmatch(boundaryErr.Message)
  1601. if len(match) > 1 {
  1602. completionPct, err := strconv.ParseFloat(match[1], 64)
  1603. if err == nil {
  1604. msg = fmt.Sprintf("%s (%.1f%% complete)", msg, completionPct)
  1605. }
  1606. }
  1607. http.Error(w, msg, http.StatusInternalServerError)
  1608. } else {
  1609. // Boundary error outside of 90 day period; may not be available
  1610. http.Error(w, boundaryErr.Error(), http.StatusInternalServerError)
  1611. }
  1612. return
  1613. }
  1614. errStr := fmt.Sprintf("error computing aggregate cost model: %s", err)
  1615. http.Error(w, errStr, http.StatusInternalServerError)
  1616. return
  1617. }
  1618. if warning == "" {
  1619. w.Write(WrapDataWithMessage(data, nil, message))
  1620. } else {
  1621. w.Write(WrapDataWithMessageAndWarning(data, nil, message, warning))
  1622. }
  1623. }