// cluster_helpers.go
  1. package costmodel
  2. import (
  3. "strconv"
  4. "time"
  5. "github.com/opencost/opencost/pkg/cloud/models"
  6. "github.com/opencost/opencost/pkg/cloud/provider"
  7. "github.com/opencost/opencost/core/pkg/log"
  8. "github.com/opencost/opencost/core/pkg/opencost"
  9. "github.com/opencost/opencost/pkg/env"
  10. "github.com/opencost/opencost/pkg/prom"
  11. )
  12. // mergeTypeMaps takes two maps of (cluster name, node name) -> node type
  13. // and combines them into a single map, preferring the k/v pairs in
  14. // the first map.
  15. func mergeTypeMaps(clusterAndNameToType1, clusterAndNameToType2 map[nodeIdentifierNoProviderID]string) map[nodeIdentifierNoProviderID]string {
  16. if clusterAndNameToType1 == nil {
  17. log.Debug("untested")
  18. log.Debug("untested")
  19. log.Debug("untested")
  20. log.Debug("untested")
  21. log.Debug("untested")
  22. log.Debug("untested")
  23. log.Debug("untested")
  24. log.Debug("untested")
  25. log.Debug("untested")
  26. log.Debug("untested")
  27. log.Debug("untested")
  28. log.Debug("untested")
  29. log.Debug("untested")
  30. log.Debug("untested")
  31. log.Debug("untested")
  32. log.Debug("untested")
  33. log.Debug("untested")
  34. log.Debug("untested")
  35. log.Debug("untested")
  36. log.Debug("untested")
  37. log.Debug("untested")
  38. log.Debug("untested")
  39. log.Debug("untested")
  40. log.Debug("untested")
  41. log.Debug("untested")
  42. log.Debug("untested")
  43. log.Debug("untested")
  44. log.Debug("untested")
  45. }
  46. merged := map[nodeIdentifierNoProviderID]string{}
  47. for k, v := range clusterAndNameToType2 {
  48. merged[k] = v
  49. }
  50. // This ordering ensures the mappings in the first arg are preferred.
  51. for k, v := range clusterAndNameToType1 {
  52. merged[k] = v
  53. }
  54. return merged
  55. }
  56. func buildCPUCostMap(
  57. resNodeCPUCost []*prom.QueryResult,
  58. cp models.Provider,
  59. preemptible map[NodeIdentifier]bool,
  60. ) (
  61. map[NodeIdentifier]float64,
  62. map[nodeIdentifierNoProviderID]string,
  63. ) {
  64. cpuCostMap := make(map[NodeIdentifier]float64)
  65. clusterAndNameToType := make(map[nodeIdentifierNoProviderID]string)
  66. customPricingEnabled := provider.CustomPricesEnabled(cp)
  67. customPricingConfig, err := cp.GetConfig()
  68. if err != nil {
  69. log.Warnf("ClusterNodes: failed to load custom pricing: %s", err)
  70. }
  71. for _, result := range resNodeCPUCost {
  72. cluster, err := result.GetString(env.GetPromClusterLabel())
  73. if err != nil {
  74. cluster = env.GetClusterID()
  75. }
  76. name, err := result.GetString("node")
  77. if err != nil {
  78. log.Warnf("ClusterNodes: CPU cost data missing node")
  79. continue
  80. }
  81. nodeType, _ := result.GetString("instance_type")
  82. providerID, _ := result.GetString("provider_id")
  83. key := NodeIdentifier{
  84. Cluster: cluster,
  85. Name: name,
  86. ProviderID: provider.ParseID(providerID),
  87. }
  88. keyNon := nodeIdentifierNoProviderID{
  89. Cluster: cluster,
  90. Name: name,
  91. }
  92. var cpuCost float64
  93. if customPricingEnabled && customPricingConfig != nil {
  94. var customCPUStr string
  95. if spot, ok := preemptible[key]; ok && spot {
  96. customCPUStr = customPricingConfig.SpotCPU
  97. } else {
  98. customCPUStr = customPricingConfig.CPU
  99. }
  100. customCPUCost, err := strconv.ParseFloat(customCPUStr, 64)
  101. if err != nil {
  102. log.Warnf("ClusterNodes: error parsing custom CPU price: %s", customCPUStr)
  103. }
  104. cpuCost = customCPUCost
  105. } else {
  106. cpuCost = result.Values[0].Value
  107. }
  108. clusterAndNameToType[keyNon] = nodeType
  109. cpuCostMap[key] = cpuCost
  110. }
  111. return cpuCostMap, clusterAndNameToType
  112. }
  113. func buildRAMCostMap(
  114. resNodeRAMCost []*prom.QueryResult,
  115. cp models.Provider,
  116. preemptible map[NodeIdentifier]bool,
  117. ) (
  118. map[NodeIdentifier]float64,
  119. map[nodeIdentifierNoProviderID]string,
  120. ) {
  121. ramCostMap := make(map[NodeIdentifier]float64)
  122. clusterAndNameToType := make(map[nodeIdentifierNoProviderID]string)
  123. customPricingEnabled := provider.CustomPricesEnabled(cp)
  124. customPricingConfig, err := cp.GetConfig()
  125. if err != nil {
  126. log.Warnf("ClusterNodes: failed to load custom pricing: %s", err)
  127. }
  128. for _, result := range resNodeRAMCost {
  129. cluster, err := result.GetString(env.GetPromClusterLabel())
  130. if err != nil {
  131. cluster = env.GetClusterID()
  132. }
  133. name, err := result.GetString("node")
  134. if err != nil {
  135. log.Warnf("ClusterNodes: RAM cost data missing node")
  136. continue
  137. }
  138. nodeType, _ := result.GetString("instance_type")
  139. providerID, _ := result.GetString("provider_id")
  140. key := NodeIdentifier{
  141. Cluster: cluster,
  142. Name: name,
  143. ProviderID: provider.ParseID(providerID),
  144. }
  145. keyNon := nodeIdentifierNoProviderID{
  146. Cluster: cluster,
  147. Name: name,
  148. }
  149. var ramCost float64
  150. if customPricingEnabled && customPricingConfig != nil {
  151. var customRAMStr string
  152. if spot, ok := preemptible[key]; ok && spot {
  153. customRAMStr = customPricingConfig.SpotRAM
  154. } else {
  155. customRAMStr = customPricingConfig.RAM
  156. }
  157. customRAMCost, err := strconv.ParseFloat(customRAMStr, 64)
  158. if err != nil {
  159. log.Warnf("ClusterNodes: error parsing custom RAM price: %s", customRAMStr)
  160. }
  161. ramCost = customRAMCost / 1024 / 1024 / 1024
  162. } else {
  163. ramCost = result.Values[0].Value
  164. }
  165. clusterAndNameToType[keyNon] = nodeType
  166. ramCostMap[key] = ramCost
  167. }
  168. return ramCostMap, clusterAndNameToType
  169. }
  170. func buildGPUCostMap(
  171. resNodeGPUCost []*prom.QueryResult,
  172. gpuCountMap map[NodeIdentifier]float64,
  173. cp models.Provider,
  174. preemptible map[NodeIdentifier]bool,
  175. ) (
  176. map[NodeIdentifier]float64,
  177. map[nodeIdentifierNoProviderID]string,
  178. ) {
  179. gpuCostMap := make(map[NodeIdentifier]float64)
  180. clusterAndNameToType := make(map[nodeIdentifierNoProviderID]string)
  181. customPricingEnabled := provider.CustomPricesEnabled(cp)
  182. customPricingConfig, err := cp.GetConfig()
  183. if err != nil {
  184. log.Warnf("ClusterNodes: failed to load custom pricing: %s", err)
  185. }
  186. for _, result := range resNodeGPUCost {
  187. cluster, err := result.GetString(env.GetPromClusterLabel())
  188. if err != nil {
  189. cluster = env.GetClusterID()
  190. }
  191. name, err := result.GetString("node")
  192. if err != nil {
  193. log.Warnf("ClusterNodes: GPU cost data missing node")
  194. continue
  195. }
  196. nodeType, _ := result.GetString("instance_type")
  197. providerID, _ := result.GetString("provider_id")
  198. key := NodeIdentifier{
  199. Cluster: cluster,
  200. Name: name,
  201. ProviderID: provider.ParseID(providerID),
  202. }
  203. keyNon := nodeIdentifierNoProviderID{
  204. Cluster: cluster,
  205. Name: name,
  206. }
  207. var gpuCost float64
  208. if customPricingEnabled && customPricingConfig != nil {
  209. var customGPUStr string
  210. if spot, ok := preemptible[key]; ok && spot {
  211. customGPUStr = customPricingConfig.SpotGPU
  212. } else {
  213. customGPUStr = customPricingConfig.GPU
  214. }
  215. customGPUCost, err := strconv.ParseFloat(customGPUStr, 64)
  216. if err != nil {
  217. log.Warnf("ClusterNodes: error parsing custom GPU price: %s", customGPUStr)
  218. }
  219. gpuCost = customGPUCost
  220. } else {
  221. gpuCost = result.Values[0].Value
  222. }
  223. clusterAndNameToType[keyNon] = nodeType
  224. // If gpu count is available use it to multiply gpu cost
  225. if value, ok := gpuCountMap[key]; ok {
  226. gpuCostMap[key] = gpuCost * value
  227. } else {
  228. gpuCostMap[key] = 0
  229. }
  230. }
  231. return gpuCostMap, clusterAndNameToType
  232. }
  233. func buildGPUCountMap(
  234. resNodeGPUCount []*prom.QueryResult,
  235. ) map[NodeIdentifier]float64 {
  236. gpuCountMap := make(map[NodeIdentifier]float64)
  237. for _, result := range resNodeGPUCount {
  238. cluster, err := result.GetString(env.GetPromClusterLabel())
  239. if err != nil {
  240. cluster = env.GetClusterID()
  241. }
  242. name, err := result.GetString("node")
  243. if err != nil {
  244. log.Warnf("ClusterNodes: GPU count data missing node")
  245. continue
  246. }
  247. gpuCount := result.Values[0].Value
  248. providerID, _ := result.GetString("provider_id")
  249. key := NodeIdentifier{
  250. Cluster: cluster,
  251. Name: name,
  252. ProviderID: provider.ParseID(providerID),
  253. }
  254. gpuCountMap[key] = gpuCount
  255. }
  256. return gpuCountMap
  257. }
  258. func buildCPUCoresMap(
  259. resNodeCPUCores []*prom.QueryResult,
  260. ) map[nodeIdentifierNoProviderID]float64 {
  261. m := make(map[nodeIdentifierNoProviderID]float64)
  262. for _, result := range resNodeCPUCores {
  263. cluster, err := result.GetString(env.GetPromClusterLabel())
  264. if err != nil {
  265. cluster = env.GetClusterID()
  266. }
  267. name, err := result.GetString("node")
  268. if err != nil {
  269. log.Warnf("ClusterNodes: CPU cores data missing node")
  270. continue
  271. }
  272. cpuCores := result.Values[0].Value
  273. key := nodeIdentifierNoProviderID{
  274. Cluster: cluster,
  275. Name: name,
  276. }
  277. m[key] = cpuCores
  278. }
  279. return m
  280. }
  281. func buildRAMBytesMap(resNodeRAMBytes []*prom.QueryResult) map[nodeIdentifierNoProviderID]float64 {
  282. m := make(map[nodeIdentifierNoProviderID]float64)
  283. for _, result := range resNodeRAMBytes {
  284. cluster, err := result.GetString(env.GetPromClusterLabel())
  285. if err != nil {
  286. cluster = env.GetClusterID()
  287. }
  288. name, err := result.GetString("node")
  289. if err != nil {
  290. log.Warnf("ClusterNodes: RAM bytes data missing node")
  291. continue
  292. }
  293. ramBytes := result.Values[0].Value
  294. key := nodeIdentifierNoProviderID{
  295. Cluster: cluster,
  296. Name: name,
  297. }
  298. m[key] = ramBytes
  299. }
  300. return m
  301. }
  302. // Mapping of cluster/node=cpu for computing resource efficiency
  303. func buildCPUBreakdownMap(resNodeCPUModeTotal []*prom.QueryResult) map[nodeIdentifierNoProviderID]*ClusterCostsBreakdown {
  304. cpuBreakdownMap := make(map[nodeIdentifierNoProviderID]*ClusterCostsBreakdown)
  305. // Mapping of cluster/node=cpu for computing resource efficiency
  306. clusterNodeCPUTotal := map[nodeIdentifierNoProviderID]float64{}
  307. // Mapping of cluster/node:mode=cpu for computing resource efficiency
  308. clusterNodeModeCPUTotal := map[nodeIdentifierNoProviderID]map[string]float64{}
  309. // Build intermediate structures for CPU usage by (cluster, node) and by
  310. // (cluster, node, mode) for computing resouce efficiency
  311. for _, result := range resNodeCPUModeTotal {
  312. cluster, err := result.GetString(env.GetPromClusterLabel())
  313. if err != nil {
  314. cluster = env.GetClusterID()
  315. }
  316. node, err := result.GetString("kubernetes_node")
  317. if err != nil {
  318. log.DedupedWarningf(5, "ClusterNodes: CPU mode data missing node")
  319. continue
  320. }
  321. mode, err := result.GetString("mode")
  322. if err != nil {
  323. log.Warnf("ClusterNodes: unable to read CPU mode: %s", err)
  324. mode = "other"
  325. }
  326. key := nodeIdentifierNoProviderID{
  327. Cluster: cluster,
  328. Name: node,
  329. }
  330. total := result.Values[0].Value
  331. // Increment total
  332. clusterNodeCPUTotal[key] += total
  333. // Increment mode
  334. if _, ok := clusterNodeModeCPUTotal[key]; !ok {
  335. clusterNodeModeCPUTotal[key] = map[string]float64{}
  336. }
  337. clusterNodeModeCPUTotal[key][mode] += total
  338. }
  339. // Compute resource efficiency from intermediate structures
  340. for key, total := range clusterNodeCPUTotal {
  341. if modeTotals, ok := clusterNodeModeCPUTotal[key]; ok {
  342. for mode, subtotal := range modeTotals {
  343. // Compute percentage for the current cluster, node, mode
  344. pct := 0.0
  345. if total > 0 {
  346. pct = subtotal / total
  347. }
  348. if _, ok := cpuBreakdownMap[key]; !ok {
  349. cpuBreakdownMap[key] = &ClusterCostsBreakdown{}
  350. }
  351. switch mode {
  352. case "idle":
  353. cpuBreakdownMap[key].Idle += pct
  354. case "system":
  355. cpuBreakdownMap[key].System += pct
  356. case "user":
  357. cpuBreakdownMap[key].User += pct
  358. default:
  359. cpuBreakdownMap[key].Other += pct
  360. }
  361. }
  362. }
  363. }
  364. return cpuBreakdownMap
  365. }
  366. func buildOverheadMap(capRam, allocRam, capCPU, allocCPU map[nodeIdentifierNoProviderID]float64) map[nodeIdentifierNoProviderID]*NodeOverhead {
  367. m := make(map[nodeIdentifierNoProviderID]*NodeOverhead, len(capRam))
  368. for identifier, ramCapacity := range capRam {
  369. allocatableRam, ok := allocRam[identifier]
  370. if !ok {
  371. log.Warnf("Could not find allocatable ram for node %s", identifier.Name)
  372. continue
  373. }
  374. overheadBytes := ramCapacity - allocatableRam
  375. m[identifier] = &NodeOverhead{
  376. RamOverheadFraction: overheadBytes / ramCapacity,
  377. }
  378. }
  379. for identifier, cpuCapacity := range capCPU {
  380. allocatableCPU, ok := allocCPU[identifier]
  381. if !ok {
  382. log.Warnf("Could not find allocatable cpu for node %s", identifier.Name)
  383. continue
  384. }
  385. overhead := cpuCapacity - allocatableCPU
  386. if _, found := m[identifier]; found {
  387. m[identifier].CpuOverheadFraction = overhead / cpuCapacity
  388. } else {
  389. m[identifier] = &NodeOverhead{
  390. CpuOverheadFraction: overhead / cpuCapacity,
  391. }
  392. }
  393. }
  394. return m
  395. }
  396. func buildRAMUserPctMap(resNodeRAMUserPct []*prom.QueryResult) map[nodeIdentifierNoProviderID]float64 {
  397. m := make(map[nodeIdentifierNoProviderID]float64)
  398. for _, result := range resNodeRAMUserPct {
  399. cluster, err := result.GetString(env.GetPromClusterLabel())
  400. if err != nil {
  401. cluster = env.GetClusterID()
  402. }
  403. name, err := result.GetString("instance")
  404. if err != nil {
  405. log.Warnf("ClusterNodes: RAM user percent missing node")
  406. continue
  407. }
  408. pct := result.Values[0].Value
  409. key := nodeIdentifierNoProviderID{
  410. Cluster: cluster,
  411. Name: name,
  412. }
  413. m[key] = pct
  414. }
  415. return m
  416. }
  417. func buildRAMSystemPctMap(resNodeRAMSystemPct []*prom.QueryResult) map[nodeIdentifierNoProviderID]float64 {
  418. m := make(map[nodeIdentifierNoProviderID]float64)
  419. for _, result := range resNodeRAMSystemPct {
  420. cluster, err := result.GetString(env.GetPromClusterLabel())
  421. if err != nil {
  422. cluster = env.GetClusterID()
  423. }
  424. name, err := result.GetString("instance")
  425. if err != nil {
  426. log.Warnf("ClusterNodes: RAM system percent missing node")
  427. continue
  428. }
  429. pct := result.Values[0].Value
  430. key := nodeIdentifierNoProviderID{
  431. Cluster: cluster,
  432. Name: name,
  433. }
  434. m[key] = pct
  435. }
  436. return m
  437. }
// activeData records the observed active window for a node as derived
// from query results in buildActiveDataMap.
type activeData struct {
	start   time.Time // beginning of the node's active window
	end     time.Time // end of the node's active window
	minutes float64   // end.Sub(start) in minutes
}
  443. func buildActiveDataMap(resActiveMins []*prom.QueryResult, resolution time.Duration, window opencost.Window) map[NodeIdentifier]activeData {
  444. m := make(map[NodeIdentifier]activeData)
  445. for _, result := range resActiveMins {
  446. cluster, err := result.GetString(env.GetPromClusterLabel())
  447. if err != nil {
  448. cluster = env.GetClusterID()
  449. }
  450. name, err := result.GetString("node")
  451. if err != nil {
  452. log.Warnf("ClusterNodes: active mins missing node")
  453. continue
  454. }
  455. providerID, _ := result.GetString("provider_id")
  456. key := NodeIdentifier{
  457. Cluster: cluster,
  458. Name: name,
  459. ProviderID: provider.ParseID(providerID),
  460. }
  461. if len(result.Values) == 0 {
  462. continue
  463. }
  464. s, e := calculateStartAndEnd(result, resolution, window)
  465. mins := e.Sub(s).Minutes()
  466. // TODO niko/assets if mins >= threshold, interpolate for missing data?
  467. m[key] = activeData{
  468. start: s,
  469. end: e,
  470. minutes: mins,
  471. }
  472. }
  473. return m
  474. }
  475. // Determine preemptibility with node labels
  476. // node id -> is preemptible?
  477. func buildPreemptibleMap(
  478. resIsSpot []*prom.QueryResult,
  479. ) map[NodeIdentifier]bool {
  480. m := make(map[NodeIdentifier]bool)
  481. for _, result := range resIsSpot {
  482. nodeName, err := result.GetString("node")
  483. if err != nil {
  484. continue
  485. }
  486. // GCP preemptible label
  487. pre := result.Values[0].Value
  488. cluster, err := result.GetString(env.GetPromClusterLabel())
  489. if err != nil {
  490. cluster = env.GetClusterID()
  491. }
  492. providerID, _ := result.GetString("provider_id")
  493. key := NodeIdentifier{
  494. Cluster: cluster,
  495. Name: nodeName,
  496. ProviderID: provider.ParseID(providerID),
  497. }
  498. // TODO(michaelmdresser): check this condition at merge time?
  499. // if node, ok := nodeMap[key]; pre > 0.0 && ok {
  500. // node.Preemptible = true
  501. // }
  502. m[key] = pre > 0.0
  503. // TODO AWS preemptible
  504. // TODO Azure preemptible
  505. }
  506. return m
  507. }
  508. func buildLabelsMap(
  509. resLabels []*prom.QueryResult,
  510. ) map[nodeIdentifierNoProviderID]map[string]string {
  511. m := make(map[nodeIdentifierNoProviderID]map[string]string)
  512. // Copy labels into node
  513. for _, result := range resLabels {
  514. cluster, err := result.GetString(env.GetPromClusterLabel())
  515. if err != nil {
  516. cluster = env.GetClusterID()
  517. }
  518. node, err := result.GetString("node")
  519. if err != nil {
  520. log.DedupedWarningf(5, "ClusterNodes: label data missing node")
  521. continue
  522. }
  523. key := nodeIdentifierNoProviderID{
  524. Cluster: cluster,
  525. Name: node,
  526. }
  527. // The QueryResult.GetLabels function needs to be called to sanitize the
  528. // ingested label data. This removes the label_ prefix that prometheus
  529. // adds to emitted labels. It also keeps from ingesting prometheus labels
  530. // that aren't a part of the asset.
  531. if _, ok := m[key]; !ok {
  532. m[key] = map[string]string{}
  533. }
  534. for k, l := range result.GetLabels() {
  535. m[key][k] = l
  536. }
  537. }
  538. return m
  539. }
  540. // checkForKeyAndInitIfMissing inits a key in the provided nodemap if
  541. // it does not exist. Intended to be called ONLY by buildNodeMap
  542. func checkForKeyAndInitIfMissing(
  543. nodeMap map[NodeIdentifier]*Node,
  544. key NodeIdentifier,
  545. clusterAndNameToType map[nodeIdentifierNoProviderID]string,
  546. ) {
  547. if _, ok := nodeMap[key]; !ok {
  548. // default nodeType in case we don't have the mapping
  549. var nodeType string
  550. if t, ok := clusterAndNameToType[nodeIdentifierNoProviderID{
  551. Cluster: key.Cluster,
  552. Name: key.Name,
  553. }]; ok {
  554. nodeType = t
  555. } else {
  556. log.Warnf("ClusterNodes: Type does not exist for node identifier %s", key)
  557. }
  558. nodeMap[key] = &Node{
  559. Cluster: key.Cluster,
  560. Name: key.Name,
  561. NodeType: nodeType,
  562. ProviderID: key.ProviderID,
  563. CPUBreakdown: &ClusterCostsBreakdown{},
  564. RAMBreakdown: &ClusterCostsBreakdown{},
  565. }
  566. }
  567. }
  568. // buildNodeMap creates the main set of node data for ClusterNodes from
  569. // the data maps built from Prometheus queries. Some of the Prometheus
  570. // data has access to the provider_id field and some does not. To get
  571. // around this problem, we use the data that includes provider_id
  572. // to build up the definitive set of nodes and then use the data
  573. // with less-specific identifiers (i.e. without provider_id) to fill
  574. // in the remaining fields.
  575. //
  576. // For example, let's say we have nodes identified like so:
  577. // cluster name/node name/provider_id. For the sake of the example,
  578. // we will also limit data to CPU cost, CPU cores, and preemptibility.
  579. //
  580. // We have CPU cost data that looks like this:
  581. // cluster1/node1/prov_node1_A: $10
  582. // cluster1/node1/prov_node1_B: $8
  583. // cluster1/node2/prov_node2: $15
  584. //
  585. // We have Preemptible data that looks like this:
  586. // cluster1/node1/prov_node1_A: true
  587. // cluster1/node1/prov_node1_B: false
  588. // cluster1/node2/prov_node2_B: false
  589. //
  590. // We have CPU cores data that looks like this:
  591. // cluster1/node1: 4
  592. // cluster1/node2: 6
  593. //
  594. // This function first combines the data that is fully identified,
  595. // creating the following:
  596. // cluster1/node1/prov_node1_A: CPUCost($10), Preemptible(true)
  597. // cluster1/node1/prov_node1_B: CPUCost($8), Preemptible(false)
  598. // cluster1/node2/prov_node2: CPUCost($15), Preemptible(false)
  599. //
  600. // It then uses the less-specific data to extend the specific data,
  601. // making the following:
  602. // cluster1/node1/prov_node1_A: CPUCost($10), Preemptible(true), Cores(4)
  603. // cluster1/node1/prov_node1_B: CPUCost($8), Preemptible(false), Cores(4)
  604. // cluster1/node2/prov_node2: CPUCost($15), Preemptible(false), Cores(6)
  605. //
  606. // In the situation where provider_id doesn't exist for any metrics,
  607. // that is the same as all provider_ids being empty strings. If
  608. // provider_id doesn't exist at all, then we (without having to do
  609. // extra work) easily fall back on identifying nodes only by cluster name
  610. // and node name because the provider_id part of the key will always
  611. // be the empty string.
  612. //
  613. // It is worth nothing that, in this approach, if a node is not present
  614. // in the more specific data but is present in the less-specific data,
  615. // that data is never processed into the final node map. For example,
  616. // let's say the CPU cores map has the following entry:
  617. // cluster1/node8: 6
  618. // But none of the maps with provider_id (CPU cost, RAM cost, etc.)
  619. // have an identifier for cluster1/node8 (regardless of provider_id).
  620. // In this situation, the final node map will not have a cluster1/node8
  621. // entry. This could be fixed by iterating over all of the less specific
  622. // identifiers and, inside that iteration, all of the identifiers in
  623. // the node map, but this would introduce a roughly quadratic time
  624. // complexity.
func buildNodeMap(
	cpuCostMap, ramCostMap, gpuCostMap, gpuCountMap map[NodeIdentifier]float64,
	cpuCoresMap, ramBytesMap, ramUserPctMap,
	ramSystemPctMap map[nodeIdentifierNoProviderID]float64,
	cpuBreakdownMap map[nodeIdentifierNoProviderID]*ClusterCostsBreakdown,
	activeDataMap map[NodeIdentifier]activeData,
	preemptibleMap map[NodeIdentifier]bool,
	labelsMap map[nodeIdentifierNoProviderID]map[string]string,
	clusterAndNameToType map[nodeIdentifierNoProviderID]string,
	res time.Duration,
	overheadMap map[nodeIdentifierNoProviderID]*NodeOverhead,
) map[NodeIdentifier]*Node {
	nodeMap := make(map[NodeIdentifier]*Node)
	// Initialize the map with the most-specific data: each map keyed by
	// the full NodeIdentifier (which includes provider_id) seeds entries
	// via checkForKeyAndInitIfMissing, then fills in its own field.
	for id, cost := range cpuCostMap {
		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
		nodeMap[id].CPUCost = cost
	}
	for id, cost := range ramCostMap {
		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
		nodeMap[id].RAMCost = cost
	}
	for id, cost := range gpuCostMap {
		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
		nodeMap[id].GPUCost = cost
	}
	for id, count := range gpuCountMap {
		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
		nodeMap[id].GPUCount = count
	}
	for id, preemptible := range preemptibleMap {
		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
		nodeMap[id].Preemptible = preemptible
	}
	for id, activeData := range activeDataMap {
		checkForKeyAndInitIfMissing(nodeMap, id, clusterAndNameToType)
		nodeMap[id].Start = activeData.start
		nodeMap[id].End = activeData.end
		// Minutes is recomputed from End/Start rather than taken from
		// activeData.minutes; the two are equivalent by construction in
		// buildActiveDataMap.
		nodeMap[id].Minutes = nodeMap[id].End.Sub(nodeMap[id].Start).Minutes()
	}
	// We now merge in data that doesn't have a provider id by looping over
	// all keys already added and inserting data according to their
	// cluster name/node name combos. Note: (cluster, node) entries with no
	// matching full identifier above are never visited (see the function
	// comment).
	for id, nodePtr := range nodeMap {
		clusterAndNameID := nodeIdentifierNoProviderID{
			Cluster: id.Cluster,
			Name:    id.Name,
		}
		if cores, ok := cpuCoresMap[clusterAndNameID]; ok {
			nodePtr.CPUCores = cores
			// If this node's type has an entry in the package-level
			// partialCPUMap, override the reported core count with the
			// mapped value and scale CPU cost by the same ratio (v/cores).
			if v, ok := partialCPUMap[nodePtr.NodeType]; ok {
				if cores > 0 {
					nodePtr.CPUCores = v
					adjustmentFactor := v / cores
					nodePtr.CPUCost = nodePtr.CPUCost * adjustmentFactor
				}
			}
		}
		if ramBytes, ok := ramBytesMap[clusterAndNameID]; ok {
			nodePtr.RAMBytes = ramBytes
		}
		if ramUserPct, ok := ramUserPctMap[clusterAndNameID]; ok {
			nodePtr.RAMBreakdown.User = ramUserPct
		}
		if ramSystemPct, ok := ramSystemPctMap[clusterAndNameID]; ok {
			nodePtr.RAMBreakdown.System = ramSystemPct
		}
		if cpuBreakdown, ok := cpuBreakdownMap[clusterAndNameID]; ok {
			// Replaces the empty breakdown installed at init time.
			nodePtr.CPUBreakdown = cpuBreakdown
		}
		if labels, ok := labelsMap[clusterAndNameID]; ok {
			nodePtr.Labels = labels
		}
		if overhead, ok := overheadMap[clusterAndNameID]; ok {
			nodePtr.Overhead = overhead
		} else {
			// we were unable to compute overhead for this node
			// assume default case of no overhead
			nodePtr.Overhead = &NodeOverhead{}
			log.Warnf("unable to compute overhead for node %s - defaulting to no overhead", clusterAndNameID.Name)
		}
	}
	return nodeMap
}