clustercache.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473
  1. package scrape
  2. import (
  3. "fmt"
  4. "slices"
  5. "strconv"
  6. "strings"
  7. "github.com/kubecost/events"
  8. "github.com/opencost/opencost/core/pkg/clustercache"
  9. "github.com/opencost/opencost/core/pkg/log"
  10. "github.com/opencost/opencost/core/pkg/source"
  11. coreutil "github.com/opencost/opencost/core/pkg/util"
  12. "github.com/opencost/opencost/core/pkg/util/promutil"
  13. "github.com/opencost/opencost/modules/collector-source/pkg/event"
  14. "github.com/opencost/opencost/modules/collector-source/pkg/metric"
  15. "github.com/opencost/opencost/modules/collector-source/pkg/util"
  16. "golang.org/x/exp/maps"
  17. v1 "k8s.io/api/core/v1"
  18. "k8s.io/apimachinery/pkg/api/resource"
  19. "k8s.io/apimachinery/pkg/types"
  20. "k8s.io/apimachinery/pkg/util/validation"
  21. )
// unmountedPVsContainer is the placeholder pod name used when reporting the
// storage allocation of a PVC that no pod in the snapshot claims.
const unmountedPVsContainer = "unmounted-pvs"
// ClusterCacheScraper is a Scraper that builds metric updates from the
// Kubernetes objects held in a clustercache.ClusterCache.
type ClusterCacheScraper struct {
	// clusterCache is the source of every object list read by Scrape.
	clusterCache clustercache.ClusterCache
}
  26. func newClusterCacheScraper(clusterCache clustercache.ClusterCache) Scraper {
  27. return &ClusterCacheScraper{
  28. clusterCache: clusterCache,
  29. }
  30. }
// pvcKey identifies a PersistentVolumeClaim by name and namespace. It is used
// as the key of the PVC name-to-UID index built in Scrape.
type pvcKey struct {
	// name is the PVC's name.
	name string
	// namespace is the namespace the PVC lives in.
	namespace string
}
  35. func (ccs *ClusterCacheScraper) Scrape() []metric.Update {
  36. // retrieve objects for scrape
  37. nodes := ccs.clusterCache.GetAllNodes()
  38. deployments := ccs.clusterCache.GetAllDeployments()
  39. namespaces := ccs.clusterCache.GetAllNamespaces()
  40. pods := ccs.clusterCache.GetAllPods()
  41. pvcs := ccs.clusterCache.GetAllPersistentVolumeClaims()
  42. pvs := ccs.clusterCache.GetAllPersistentVolumes()
  43. services := ccs.clusterCache.GetAllServices()
  44. statefulSets := ccs.clusterCache.GetAllStatefulSets()
  45. daemonSets := ccs.clusterCache.GetAllDaemonSets()
  46. jobs := ccs.clusterCache.GetAllJobs()
  47. cronJobs := ccs.clusterCache.GetAllCronJobs()
  48. replicaSets := ccs.clusterCache.GetAllReplicaSets()
  49. resourceQuotas := ccs.clusterCache.GetAllResourceQuotas()
  50. // create scrape indexes. While the pairs being mapped here don't have a 1 to 1 relationship in the general case,
  51. // we are assuming that in the context of a single snapshot of the cluster they are 1 to 1.
  52. nodeNameToUID := make(map[string]types.UID, len(nodes))
  53. for _, node := range nodes {
  54. nodeNameToUID[node.Name] = node.UID
  55. }
  56. namespaceNameToUID := make(map[string]types.UID, len(namespaces))
  57. for _, ns := range namespaces {
  58. namespaceNameToUID[ns.Name] = ns.UID
  59. }
  60. pvcNameToUID := make(map[pvcKey]types.UID, len(pvcs))
  61. for _, pvc := range pvcs {
  62. key := pvcKey{
  63. name: pvc.Name,
  64. namespace: pvc.Namespace,
  65. }
  66. pvcNameToUID[key] = pvc.UID
  67. }
  68. pvNameToUID := make(map[string]types.UID, len(pvs))
  69. for _, pv := range pvs {
  70. pvNameToUID[pv.Name] = pv.UID
  71. }
  72. scrapeFuncs := []ScrapeFunc{
  73. ccs.GetScrapeNodes(nodes),
  74. ccs.GetScrapeDeployments(deployments, namespaceNameToUID),
  75. ccs.GetScrapeNamespaces(namespaces),
  76. ccs.GetScrapePods(pods, pvcs, nodeNameToUID, namespaceNameToUID, pvcNameToUID),
  77. ccs.GetScrapePVCs(pvcs, namespaceNameToUID, pvNameToUID),
  78. ccs.GetScrapePVs(pvs),
  79. ccs.GetScrapeServices(services),
  80. ccs.GetScrapeStatefulSets(statefulSets, namespaceNameToUID),
  81. ccs.GetScrapeDaemonSets(daemonSets, namespaceNameToUID),
  82. ccs.GetScrapeJobs(jobs, namespaceNameToUID),
  83. ccs.GetScrapeCronJobs(cronJobs, namespaceNameToUID),
  84. ccs.GetScrapeReplicaSets(replicaSets, namespaceNameToUID),
  85. ccs.GetScrapeResourceQuotas(resourceQuotas, namespaceNameToUID),
  86. }
  87. return concurrentScrape(scrapeFuncs...)
  88. }
  89. func (ccs *ClusterCacheScraper) GetScrapeNodes(nodes []*clustercache.Node) ScrapeFunc {
  90. return func() []metric.Update {
  91. return ccs.scrapeNodes(nodes)
  92. }
  93. }
  94. func (ccs *ClusterCacheScraper) scrapeNodes(nodes []*clustercache.Node) []metric.Update {
  95. var scrapeResults []metric.Update
  96. for _, node := range nodes {
  97. nodeInfo := map[string]string{
  98. source.NodeLabel: node.Name,
  99. source.ProviderIDLabel: node.SpecProviderID,
  100. source.UIDLabel: string(node.UID),
  101. }
  102. if instanceType, ok := coreutil.GetInstanceType(node.Labels); ok {
  103. nodeInfo[source.InstanceTypeLabel] = instanceType
  104. }
  105. scrapeResults = append(scrapeResults, metric.Update{
  106. Name: metric.NodeInfo,
  107. Labels: nodeInfo,
  108. AdditionalInfo: nodeInfo,
  109. })
  110. // Node Capacity
  111. scrapeResults = scrapeResourceList(
  112. metric.NodeResourceCapacities,
  113. node.Status.Capacity,
  114. nodeInfo,
  115. scrapeResults)
  116. // This block and metric can be removed, when we stop exporting assets and allocations
  117. if node.Status.Capacity != nil {
  118. if quantity, ok := node.Status.Capacity[v1.ResourceCPU]; ok {
  119. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  120. scrapeResults = append(scrapeResults, metric.Update{
  121. Name: metric.KubeNodeStatusCapacityCPUCores,
  122. Labels: nodeInfo,
  123. Value: value,
  124. })
  125. }
  126. if quantity, ok := node.Status.Capacity[v1.ResourceMemory]; ok {
  127. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  128. scrapeResults = append(scrapeResults, metric.Update{
  129. Name: metric.KubeNodeStatusCapacityMemoryBytes,
  130. Labels: nodeInfo,
  131. Value: value,
  132. })
  133. }
  134. }
  135. // Node Allocatable Resources
  136. scrapeResults = scrapeResourceList(
  137. metric.NodeResourcesAllocatable,
  138. node.Status.Allocatable,
  139. nodeInfo,
  140. scrapeResults)
  141. // This block and metric can be removed, when we stop exporting assets and allocations
  142. if node.Status.Allocatable != nil {
  143. if quantity, ok := node.Status.Allocatable[v1.ResourceCPU]; ok {
  144. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  145. scrapeResults = append(scrapeResults, metric.Update{
  146. Name: metric.KubeNodeStatusAllocatableCPUCores,
  147. Labels: nodeInfo,
  148. Value: value,
  149. })
  150. }
  151. if quantity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok {
  152. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  153. scrapeResults = append(scrapeResults, metric.Update{
  154. Name: metric.KubeNodeStatusAllocatableMemoryBytes,
  155. Labels: nodeInfo,
  156. Value: value,
  157. })
  158. }
  159. }
  160. // node labels
  161. labelNames, labelValues := promutil.KubeLabelsToLabels(node.Labels)
  162. nodeLabels := util.ToMap(labelNames, labelValues)
  163. scrapeResults = append(scrapeResults, metric.Update{
  164. Name: metric.KubeNodeLabels,
  165. Labels: nodeInfo,
  166. Value: 0,
  167. AdditionalInfo: nodeLabels,
  168. })
  169. }
  170. events.Dispatch(event.ScrapeEvent{
  171. ScraperName: event.KubernetesClusterScraperName,
  172. ScrapeType: event.NodeScraperType,
  173. Targets: len(nodes),
  174. Errors: nil,
  175. })
  176. return scrapeResults
  177. }
  178. func (ccs *ClusterCacheScraper) GetScrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) ScrapeFunc {
  179. return func() []metric.Update {
  180. return ccs.scrapeDeployments(deployments, namespaceIndex)
  181. }
  182. }
  183. func (ccs *ClusterCacheScraper) scrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) []metric.Update {
  184. var scrapeResults []metric.Update
  185. for _, deployment := range deployments {
  186. nsUID, ok := namespaceIndex[deployment.Namespace]
  187. if !ok {
  188. log.Debugf("deployment namespaceUID missing from index for namespace name '%s'", deployment.Namespace)
  189. }
  190. deploymentInfo := map[string]string{
  191. source.UIDLabel: string(deployment.UID),
  192. source.NamespaceUIDLabel: string(nsUID),
  193. source.NamespaceLabel: deployment.Namespace,
  194. source.DeploymentLabel: deployment.Name,
  195. }
  196. scrapeResults = append(scrapeResults, metric.Update{
  197. Name: metric.DeploymentInfo,
  198. Labels: deploymentInfo,
  199. Value: 0,
  200. AdditionalInfo: deploymentInfo,
  201. })
  202. // deployment labels
  203. labelNames, labelValues := promutil.KubeLabelsToLabels(deployment.Labels)
  204. deploymentLabels := util.ToMap(labelNames, labelValues)
  205. scrapeResults = append(scrapeResults, metric.Update{
  206. Name: metric.DeploymentLabels,
  207. Labels: deploymentInfo,
  208. Value: 0,
  209. AdditionalInfo: deploymentLabels,
  210. })
  211. // deployment annotations
  212. annoationNames, annotationValues := promutil.KubeAnnotationsToLabels(deployment.Annotations)
  213. deploymentAnnotations := util.ToMap(annoationNames, annotationValues)
  214. scrapeResults = append(scrapeResults, metric.Update{
  215. Name: metric.DeploymentAnnotations,
  216. Labels: deploymentInfo,
  217. Value: 0,
  218. AdditionalInfo: deploymentAnnotations,
  219. })
  220. // deployment match labels
  221. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(deployment.MatchLabels)
  222. deploymentMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  223. scrapeResults = append(scrapeResults, metric.Update{
  224. Name: metric.DeploymentMatchLabels,
  225. Labels: deploymentInfo,
  226. Value: 0,
  227. AdditionalInfo: deploymentMatchLabels,
  228. })
  229. }
  230. events.Dispatch(event.ScrapeEvent{
  231. ScraperName: event.KubernetesClusterScraperName,
  232. ScrapeType: event.DeploymentScraperType,
  233. Targets: len(deployments),
  234. Errors: nil,
  235. })
  236. return scrapeResults
  237. }
  238. func (ccs *ClusterCacheScraper) GetScrapeNamespaces(namespaces []*clustercache.Namespace) ScrapeFunc {
  239. return func() []metric.Update {
  240. return ccs.scrapeNamespaces(namespaces)
  241. }
  242. }
  243. func (ccs *ClusterCacheScraper) scrapeNamespaces(namespaces []*clustercache.Namespace) []metric.Update {
  244. var scrapeResults []metric.Update
  245. for _, namespace := range namespaces {
  246. namespaceInfo := map[string]string{
  247. source.NamespaceLabel: namespace.Name,
  248. source.UIDLabel: string(namespace.UID),
  249. }
  250. scrapeResults = append(scrapeResults, metric.Update{
  251. Name: metric.NamespaceInfo,
  252. Labels: namespaceInfo,
  253. AdditionalInfo: namespaceInfo,
  254. Value: 0,
  255. })
  256. // namespace labels
  257. labelNames, labelValues := promutil.KubeLabelsToLabels(namespace.Labels)
  258. namespaceLabels := util.ToMap(labelNames, labelValues)
  259. scrapeResults = append(scrapeResults, metric.Update{
  260. Name: metric.KubeNamespaceLabels,
  261. Labels: namespaceInfo,
  262. Value: 0,
  263. AdditionalInfo: namespaceLabels,
  264. })
  265. // namespace annotations
  266. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(namespace.Annotations)
  267. namespaceAnnotations := util.ToMap(annotationNames, annotationValues)
  268. scrapeResults = append(scrapeResults, metric.Update{
  269. Name: metric.KubeNamespaceAnnotations,
  270. Labels: namespaceInfo,
  271. Value: 0,
  272. AdditionalInfo: namespaceAnnotations,
  273. })
  274. }
  275. events.Dispatch(event.ScrapeEvent{
  276. ScraperName: event.KubernetesClusterScraperName,
  277. ScrapeType: event.NamespaceScraperType,
  278. Targets: len(namespaces),
  279. Errors: nil,
  280. })
  281. return scrapeResults
  282. }
  283. func (ccs *ClusterCacheScraper) GetScrapePods(
  284. pods []*clustercache.Pod,
  285. pvcs []*clustercache.PersistentVolumeClaim,
  286. nodeIndex map[string]types.UID,
  287. namespaceIndex map[string]types.UID,
  288. pvcIndex map[pvcKey]types.UID,
  289. ) ScrapeFunc {
  290. return func() []metric.Update {
  291. return ccs.scrapePods(pods, pvcs, nodeIndex, namespaceIndex, pvcIndex)
  292. }
  293. }
// scrapePods converts the supplied pods into metric updates and dispatches a
// ScrapeEvent carrying the number of pods processed.
//
// For each pod it emits: pod info, labels and annotations; one owner metric per
// owner reference; a running-status metric for each container whose state is
// Running; a pod/PVC volume metric for each PVC-backed volume found in
// pvcIndex; per-container resource requests and limits; and, for containers
// that request (or are limited to) a GPU resource, a GPU allocation metric.
// After all pods have been visited it emits PVC allocation metrics based on how
// many pods claim each PVC.
//
// nodeIndex and namespaceIndex map names to UIDs; a missing entry is logged at
// debug level and the corresponding UID label is left empty. pvcIndex maps a
// PVC's name and namespace to its UID.
func (ccs *ClusterCacheScraper) scrapePods(
	pods []*clustercache.Pod,
	pvcs []*clustercache.PersistentVolumeClaim,
	nodeIndex map[string]types.UID,
	namespaceIndex map[string]types.UID,
	pvcIndex map[pvcKey]types.UID,
) []metric.Update {
	// this is only populated if we find gpu resources being requested
	var nodesGpuInfo map[string]*NodeGpuInfo

	// pv allocation and unmounted pvs; looked up below with keys of the form
	// "<namespace>,<claim name>"
	pvcInfo := getPvcsInfo(pvcs)

	// pod info by uid, used after the pod loop to label PVC allocation metrics
	podInfoByUid := make(map[string]map[string]string)

	var scrapeResults []metric.Update
	for _, pod := range pods {
		nodeUID, ok := nodeIndex[pod.Spec.NodeName]
		if !ok {
			log.Debugf("pod nodeUID missing from index for node name '%s'", pod.Spec.NodeName)
		}

		nsUID, ok := namespaceIndex[pod.Namespace]
		if !ok {
			log.Debugf("pod namespaceUID missing from index for namespace name '%s'", pod.Namespace)
		}

		podInfo := map[string]string{
			source.UIDLabel:          string(pod.UID),
			source.PodLabel:          pod.Name,
			source.NamespaceUIDLabel: string(nsUID),
			source.NodeUIDLabel:      string(nodeUID),
		}

		scrapeResults = append(scrapeResults, metric.Update{
			Name:           metric.PodInfo,
			Labels:         podInfo,
			Value:          0,
			AdditionalInfo: podInfo,
		})

		// NOTE(review): podInfo is the same map instance referenced by the PodInfo
		// update above, so the keys added here also appear in that update's Labels
		// and AdditionalInfo — confirm this is intended.
		podInfo[source.NamespaceLabel] = pod.Namespace
		podInfo[source.NodeLabel] = pod.Spec.NodeName
		podInfo[source.InstanceLabel] = pod.Spec.NodeName

		podInfoByUid[string(pod.UID)] = podInfo

		// pod labels
		labelNames, labelValues := promutil.KubeLabelsToLabels(pod.Labels)
		podLabels := util.ToMap(labelNames, labelValues)
		scrapeResults = append(scrapeResults, metric.Update{
			Name:           metric.KubePodLabels,
			Labels:         podInfo,
			Value:          0,
			AdditionalInfo: podLabels,
		})

		// pod annotations
		annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(pod.Annotations)
		podAnnotations := util.ToMap(annotationNames, annotationValues)
		scrapeResults = append(scrapeResults, metric.Update{
			Name:           metric.KubePodAnnotations,
			Labels:         podInfo,
			Value:          0,
			AdditionalInfo: podAnnotations,
		})

		// Determine PVC use data for Pod. The claimed set ensures a pod is
		// recorded at most once per PVC even if several of its volumes reference
		// the same claim.
		claimed := make(map[string]struct{})
		for _, volume := range pod.Spec.Volumes {
			if volume.PersistentVolumeClaim != nil {
				name := volume.PersistentVolumeClaim.ClaimName
				key := pod.Namespace + "," + name
				if _, seen := claimed[key]; seen {
					continue
				}
				// claims that are not present in pvcInfo are ignored
				if pvc, ok := pvcInfo[key]; ok {
					pvc.PodsClaimed = append(pvc.PodsClaimed, string(pod.UID))
					claimed[key] = struct{}{}
				}
			}
		}

		// Pod owner metric
		for _, owner := range pod.OwnerReferences {
			controller := "false"
			if owner.Controller != nil && *owner.Controller {
				controller = "true"
			}
			// clone so the owner-specific keys do not leak into podInfo
			ownerInfo := maps.Clone(podInfo)
			ownerInfo[source.OwnerKindLabel] = owner.Kind
			ownerInfo[source.OwnerNameLabel] = owner.Name
			ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
			// NOTE(review): the controller flag ("true"/"false") is stored under
			// ContainerLabel — confirm this is the intended label.
			ownerInfo[source.ContainerLabel] = controller
			scrapeResults = append(scrapeResults, metric.Update{
				Name:   metric.KubePodOwner,
				Labels: ownerInfo,
				Value:  0,
			})
		}

		// Container Status: only containers currently in the Running state are reported
		for _, status := range pod.Status.ContainerStatuses {
			if status.State.Running != nil {
				containerInfo := maps.Clone(podInfo)
				containerInfo[source.ContainerLabel] = status.Name
				scrapeResults = append(scrapeResults, metric.Update{
					Name:           metric.KubePodContainerStatusRunning,
					Labels:         containerInfo,
					AdditionalInfo: containerInfo,
					Value:          0,
				})
			}
		}

		// Pod to PVC volume mapping; volumes whose claim is not in pvcIndex are skipped
		for _, volume := range pod.Spec.Volumes {
			if volume.PersistentVolumeClaim != nil {
				pvcUID, ok := pvcIndex[pvcKey{
					name:      volume.PersistentVolumeClaim.ClaimName,
					namespace: pod.Namespace,
				}]
				if !ok {
					continue
				}
				podPVCVolumeInfo := map[string]string{
					source.UIDLabel:           string(pod.UID),
					source.PVCUIDLabel:        string(pvcUID),
					source.PodVolumeNameLabel: volume.Name,
				}
				scrapeResults = append(scrapeResults, metric.Update{
					Name:   metric.PodPVCVolume,
					Labels: podPVCVolumeInfo,
					Value:  0,
				})
			}
		}

		for _, container := range pod.Spec.Containers {
			containerInfo := maps.Clone(podInfo)
			containerInfo[source.ContainerLabel] = container.Name
			// Requests
			scrapeResults = scrapeResourceList(
				metric.KubePodContainerResourceRequests,
				container.Resources.Requests,
				containerInfo,
				scrapeResults)

			// Limits
			scrapeResults = scrapeResourceList(
				metric.KubePodContainerResourceLimits,
				container.Resources.Limits,
				containerInfo,
				scrapeResults)

			// Todo remove when asset/allocation pipeline are removed
			// gpu "requests" is either the request or limit if it exists
			var gpuRequest *float64
			for resourceName, quantity := range container.Resources.Requests {
				if isGpuResourceName(resourceName) {
					// set gpu request if it exists
					_, _, value := toResourceUnitValue(resourceName, quantity)
					gpuRequestValue := value
					gpuRequest = &gpuRequestValue
					break
				}
			}

			// Limits: only consulted when no GPU request was found
			if gpuRequest == nil {
				for resourceName, quantity := range container.Resources.Limits {
					if isGpuResourceName(resourceName) {
						// set gpu request if it exists
						_, _, value := toResourceUnitValue(resourceName, quantity)
						gpuRequestValue := value
						gpuRequest = &gpuRequestValue
						break
					}
				}
			}

			// handle the GPU allocation metric here IFF there exists a request/limit for GPUs
			// we only load the node gpu data map if we run into a container with gpu requests/limits
			if gpuRequest != nil {
				if nodesGpuInfo == nil {
					nodesGpuInfo = ccs.getNodesGpuInfo()
				}

				gpuAlloc := *gpuRequest
				// scale the request by the node's GPU/VGPU ratio when the node reports
				// a non-zero VGPU count (the zero check also avoids dividing by zero)
				if nodeGpuInfo, ok := nodesGpuInfo[pod.Spec.NodeName]; ok {
					if nodeGpuInfo != nil && nodeGpuInfo.VGPU != 0 {
						gpuAlloc = gpuAlloc * (nodeGpuInfo.GPU / nodeGpuInfo.VGPU)
					}
				}

				scrapeResults = append(scrapeResults, metric.Update{
					Name:   metric.ContainerGPUAllocation,
					Labels: maps.Clone(containerInfo),
					Value:  gpuAlloc,
				})
			}
		}
	}

	// Iterate through PVC Info after the pods have been tallied and export
	// allocation metrics based on the number of other pods claiming the volume
	for _, pvc := range pvcInfo {
		// unmounted pvs get full allocation
		if len(pvc.PodsClaimed) == 0 {
			labels := map[string]string{
				source.PodLabel:       unmountedPVsContainer,
				source.NamespaceLabel: pvc.Namespace,
				source.PVCLabel:       pvc.Claim,
				source.PVLabel:        pvc.VolumeName,
			}
			scrapeResults = append(scrapeResults, metric.Update{
				Name:   metric.PodPVCAllocation,
				Labels: labels,
				Value:  pvc.Requests,
			})
			continue
		}

		// pods get a proportion of pv allocation: the request is split evenly
		// across every pod recorded as claiming the PVC
		value := pvc.Requests / float64(len(pvc.PodsClaimed))
		for _, podUid := range pvc.PodsClaimed {
			podInfo, ok := podInfoByUid[podUid]
			if !ok {
				continue
			}
			pvcLabels := maps.Clone(podInfo)
			pvcLabels[source.PVCLabel] = pvc.Claim
			pvcLabels[source.PVLabel] = pvc.VolumeName
			scrapeResults = append(scrapeResults, metric.Update{
				Name:   metric.PodPVCAllocation,
				Labels: pvcLabels,
				Value:  value,
			})
		}
	}

	events.Dispatch(event.ScrapeEvent{
		ScraperName: event.KubernetesClusterScraperName,
		ScrapeType:  event.PodScraperType,
		Targets:     len(pods),
		Errors:      nil,
	})

	return scrapeResults
}
  519. func scrapeResourceList(metricName string, resourceList v1.ResourceList, baseLabels map[string]string, scrapeResults []metric.Update) []metric.Update {
  520. if resourceList != nil {
  521. // sorting keys here for testing purposes
  522. keys := maps.Keys(resourceList)
  523. slices.Sort(keys)
  524. for _, resourceName := range keys {
  525. quantity := resourceList[resourceName]
  526. resource, unit, value := toResourceUnitValue(resourceName, quantity)
  527. // failed to parse the resource type
  528. if resource == "" {
  529. log.DedupedWarningf(5, "Failed to parse resource units and quantity for resource: %s", resourceName)
  530. continue
  531. }
  532. resourceRequestInfo := maps.Clone(baseLabels)
  533. resourceRequestInfo[source.ResourceLabel] = resource
  534. resourceRequestInfo[source.UnitLabel] = unit
  535. scrapeResults = append(scrapeResults, metric.Update{
  536. Name: metricName,
  537. Labels: resourceRequestInfo,
  538. Value: value,
  539. })
  540. }
  541. }
  542. return scrapeResults
  543. }
  544. func (ccs *ClusterCacheScraper) GetScrapePVCs(
  545. pvcs []*clustercache.PersistentVolumeClaim,
  546. namespaceIndex map[string]types.UID,
  547. pvIndex map[string]types.UID,
  548. ) ScrapeFunc {
  549. return func() []metric.Update {
  550. return ccs.scrapePVCs(pvcs, namespaceIndex, pvIndex)
  551. }
  552. }
  553. func (ccs *ClusterCacheScraper) scrapePVCs(
  554. pvcs []*clustercache.PersistentVolumeClaim,
  555. namespaceIndex map[string]types.UID,
  556. pvIndex map[string]types.UID,
  557. ) []metric.Update {
  558. var scrapeResults []metric.Update
  559. for _, pvc := range pvcs {
  560. nsUID, ok := namespaceIndex[pvc.Namespace]
  561. if !ok {
  562. log.Debugf("pvc namespaceUID missing from index for namespace name '%s'", pvc.Namespace)
  563. }
  564. pvUID, ok := pvIndex[pvc.Spec.VolumeName]
  565. if !ok && pvc.Spec.VolumeName != "" {
  566. log.Debugf("pvc volume name missing from index for pv name '%s'", pvc.Spec.VolumeName)
  567. }
  568. pvcInfo := map[string]string{
  569. source.UIDLabel: string(pvc.UID),
  570. source.PVCLabel: pvc.Name,
  571. source.NamespaceUIDLabel: string(nsUID),
  572. source.NamespaceLabel: pvc.Namespace,
  573. source.VolumeNameLabel: pvc.Spec.VolumeName,
  574. source.PVUIDLabel: string(pvUID),
  575. source.StorageClassLabel: getPersistentVolumeClaimClass(pvc),
  576. }
  577. scrapeResults = append(scrapeResults, metric.Update{
  578. Name: metric.KubePersistentVolumeClaimInfo,
  579. Labels: pvcInfo,
  580. AdditionalInfo: pvcInfo,
  581. Value: 0,
  582. })
  583. if storage, ok := pvc.Spec.Resources.Requests[v1.ResourceStorage]; ok {
  584. scrapeResults = append(scrapeResults, metric.Update{
  585. Name: metric.KubePersistentVolumeClaimResourceRequestsStorageBytes,
  586. Labels: pvcInfo,
  587. Value: float64(storage.Value()),
  588. })
  589. }
  590. }
  591. events.Dispatch(event.ScrapeEvent{
  592. ScraperName: event.KubernetesClusterScraperName,
  593. ScrapeType: event.PvcScraperType,
  594. Targets: len(pvcs),
  595. Errors: nil,
  596. })
  597. return scrapeResults
  598. }
  599. func (ccs *ClusterCacheScraper) GetScrapePVs(pvs []*clustercache.PersistentVolume) ScrapeFunc {
  600. return func() []metric.Update {
  601. return ccs.scrapePVs(pvs)
  602. }
  603. }
  604. func (ccs *ClusterCacheScraper) scrapePVs(pvs []*clustercache.PersistentVolume) []metric.Update {
  605. var scrapeResults []metric.Update
  606. for _, pv := range pvs {
  607. providerID := pv.Name
  608. var csiVolumeHandle string
  609. // if a more accurate provider ID is available, use that
  610. if pv.Spec.CSI != nil && pv.Spec.CSI.VolumeHandle != "" {
  611. providerID = pv.Spec.CSI.VolumeHandle
  612. csiVolumeHandle = pv.Spec.CSI.VolumeHandle
  613. }
  614. pvInfo := map[string]string{
  615. source.UIDLabel: string(pv.UID),
  616. source.PVLabel: pv.Name,
  617. source.StorageClassLabel: pv.Spec.StorageClassName,
  618. source.ProviderIDLabel: providerID,
  619. source.CSIVolumeHandleLabel: csiVolumeHandle,
  620. }
  621. scrapeResults = append(scrapeResults, metric.Update{
  622. Name: metric.KubecostPVInfo,
  623. Labels: pvInfo,
  624. AdditionalInfo: pvInfo,
  625. Value: 0,
  626. })
  627. if storage, ok := pv.Spec.Capacity[v1.ResourceStorage]; ok {
  628. scrapeResults = append(scrapeResults, metric.Update{
  629. Name: metric.KubePersistentVolumeCapacityBytes,
  630. Labels: pvInfo,
  631. Value: float64(storage.Value()),
  632. })
  633. }
  634. }
  635. events.Dispatch(event.ScrapeEvent{
  636. ScraperName: event.KubernetesClusterScraperName,
  637. ScrapeType: event.PvScraperType,
  638. Targets: len(pvs),
  639. Errors: nil,
  640. })
  641. return scrapeResults
  642. }
  643. func (ccs *ClusterCacheScraper) GetScrapeServices(services []*clustercache.Service) ScrapeFunc {
  644. return func() []metric.Update {
  645. return ccs.scrapeServices(services)
  646. }
  647. }
  648. func (ccs *ClusterCacheScraper) scrapeServices(services []*clustercache.Service) []metric.Update {
  649. var scrapeResults []metric.Update
  650. for _, service := range services {
  651. serviceInfo := map[string]string{
  652. source.UIDLabel: string(service.UID),
  653. source.ServiceLabel: service.Name,
  654. source.NamespaceLabel: service.Namespace,
  655. source.ServiceTypeLabel: string(service.Type),
  656. }
  657. scrapeResults = append(scrapeResults, metric.Update{
  658. Name: metric.ServiceInfo,
  659. Labels: serviceInfo,
  660. Value: 0,
  661. AdditionalInfo: serviceInfo,
  662. })
  663. // service selector labels
  664. selectorNames, selectorValues := promutil.KubeLabelsToLabels(service.SpecSelector)
  665. serviceLabels := util.ToMap(selectorNames, selectorValues)
  666. scrapeResults = append(scrapeResults, metric.Update{
  667. Name: metric.ServiceSelectorLabels,
  668. Labels: serviceInfo,
  669. Value: 0,
  670. AdditionalInfo: serviceLabels,
  671. })
  672. }
  673. events.Dispatch(event.ScrapeEvent{
  674. ScraperName: event.KubernetesClusterScraperName,
  675. ScrapeType: event.ServiceScraperType,
  676. Targets: len(services),
  677. Errors: nil,
  678. })
  679. return scrapeResults
  680. }
  681. func (ccs *ClusterCacheScraper) GetScrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  682. return func() []metric.Update {
  683. return ccs.scrapeStatefulSets(statefulSets, namespaceIndex)
  684. }
  685. }
  686. func (ccs *ClusterCacheScraper) scrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) []metric.Update {
  687. var scrapeResults []metric.Update
  688. for _, statefulSet := range statefulSets {
  689. nsUID, ok := namespaceIndex[statefulSet.Namespace]
  690. if !ok {
  691. log.Debugf("statefulSet namespaceUID missing from index for namespace name '%s'", statefulSet.Namespace)
  692. }
  693. statefulSetInfo := map[string]string{
  694. source.UIDLabel: string(statefulSet.UID),
  695. source.NamespaceUIDLabel: string(nsUID),
  696. source.StatefulSetLabel: statefulSet.Name,
  697. }
  698. // statefulSet info
  699. scrapeResults = append(scrapeResults, metric.Update{
  700. Name: metric.StatefulSetInfo,
  701. Labels: statefulSetInfo,
  702. Value: 0,
  703. AdditionalInfo: statefulSetInfo,
  704. })
  705. // statefulSet labels
  706. labelNames, labelValues := promutil.KubeLabelsToLabels(statefulSet.Labels)
  707. statefulSetLabels := util.ToMap(labelNames, labelValues)
  708. scrapeResults = append(scrapeResults, metric.Update{
  709. Name: metric.StatefulSetLabels,
  710. Labels: statefulSetInfo,
  711. Value: 0,
  712. AdditionalInfo: statefulSetLabels,
  713. })
  714. // statefulSet annotations
  715. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(statefulSet.Annotations)
  716. statefulSetAnnotations := util.ToMap(annotationNames, annotationValues)
  717. scrapeResults = append(scrapeResults, metric.Update{
  718. Name: metric.StatefulSetAnnotations,
  719. Labels: statefulSetInfo,
  720. Value: 0,
  721. AdditionalInfo: statefulSetAnnotations,
  722. })
  723. // statefulSet match labels
  724. statefulSetInfo[source.NamespaceLabel] = statefulSet.Namespace
  725. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(statefulSet.SpecSelector.MatchLabels)
  726. statefulSetMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  727. scrapeResults = append(scrapeResults, metric.Update{
  728. Name: metric.StatefulSetMatchLabels,
  729. Labels: statefulSetInfo,
  730. Value: 0,
  731. AdditionalInfo: statefulSetMatchLabels,
  732. })
  733. }
  734. events.Dispatch(event.ScrapeEvent{
  735. ScraperName: event.KubernetesClusterScraperName,
  736. ScrapeType: event.StatefulSetScraperType,
  737. Targets: len(statefulSets),
  738. Errors: nil,
  739. })
  740. return scrapeResults
  741. }
  742. func (ccs *ClusterCacheScraper) GetScrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  743. return func() []metric.Update {
  744. return ccs.scrapeDaemonSets(daemonSets, namespaceIndex)
  745. }
  746. }
  747. func (ccs *ClusterCacheScraper) scrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) []metric.Update {
  748. var scrapeResults []metric.Update
  749. for _, daemonSet := range daemonSets {
  750. nsUID, ok := namespaceIndex[daemonSet.Namespace]
  751. if !ok {
  752. log.Debugf("daemonSet namespaceUID missing from index for namespace name '%s'", daemonSet.Namespace)
  753. }
  754. daemonSetInfo := map[string]string{
  755. source.UIDLabel: string(daemonSet.UID),
  756. source.NamespaceUIDLabel: string(nsUID),
  757. source.DaemonSetLabel: daemonSet.Name,
  758. }
  759. // daemonSet info
  760. scrapeResults = append(scrapeResults, metric.Update{
  761. Name: metric.DaemonSetInfo,
  762. Labels: daemonSetInfo,
  763. Value: 0,
  764. AdditionalInfo: daemonSetInfo,
  765. })
  766. // daemonSet labels
  767. labelNames, labelValues := promutil.KubeLabelsToLabels(daemonSet.Labels)
  768. daemonSetLabels := util.ToMap(labelNames, labelValues)
  769. scrapeResults = append(scrapeResults, metric.Update{
  770. Name: metric.DaemonSetLabels,
  771. Labels: daemonSetInfo,
  772. Value: 0,
  773. AdditionalInfo: daemonSetLabels,
  774. })
  775. // daemonSet annotations
  776. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(daemonSet.Annotations)
  777. daemonSetAnnotations := util.ToMap(annotationNames, annotationValues)
  778. scrapeResults = append(scrapeResults, metric.Update{
  779. Name: metric.DaemonSetAnnotations,
  780. Labels: daemonSetInfo,
  781. Value: 0,
  782. AdditionalInfo: daemonSetAnnotations,
  783. })
  784. }
  785. events.Dispatch(event.ScrapeEvent{
  786. ScraperName: event.KubernetesClusterScraperName,
  787. ScrapeType: event.DaemonSetScraperType,
  788. Targets: len(daemonSets),
  789. Errors: nil,
  790. })
  791. return scrapeResults
  792. }
  793. func (ccs *ClusterCacheScraper) GetScrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) ScrapeFunc {
  794. return func() []metric.Update {
  795. return ccs.scrapeJobs(jobs, namespaceIndex)
  796. }
  797. }
  798. func (ccs *ClusterCacheScraper) scrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) []metric.Update {
  799. var scrapeResults []metric.Update
  800. for _, job := range jobs {
  801. nsUID, ok := namespaceIndex[job.Namespace]
  802. if !ok {
  803. log.Debugf("job namespaceUID missing from index for namespace name '%s'", job.Namespace)
  804. }
  805. jobInfo := map[string]string{
  806. source.UIDLabel: string(job.UID),
  807. source.NamespaceUIDLabel: string(nsUID),
  808. source.JobLabel: job.Name,
  809. }
  810. // job info
  811. scrapeResults = append(scrapeResults, metric.Update{
  812. Name: metric.JobInfo,
  813. Labels: jobInfo,
  814. Value: 0,
  815. AdditionalInfo: jobInfo,
  816. })
  817. // job labels
  818. labelNames, labelValues := promutil.KubeLabelsToLabels(job.Labels)
  819. jobLabels := util.ToMap(labelNames, labelValues)
  820. scrapeResults = append(scrapeResults, metric.Update{
  821. Name: metric.JobLabels,
  822. Labels: jobInfo,
  823. Value: 0,
  824. AdditionalInfo: jobLabels,
  825. })
  826. // job annotations
  827. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(job.Annotations)
  828. jobAnnotations := util.ToMap(annotationNames, annotationValues)
  829. scrapeResults = append(scrapeResults, metric.Update{
  830. Name: metric.JobAnnotations,
  831. Labels: jobInfo,
  832. Value: 0,
  833. AdditionalInfo: jobAnnotations,
  834. })
  835. }
  836. events.Dispatch(event.ScrapeEvent{
  837. ScraperName: event.KubernetesClusterScraperName,
  838. ScrapeType: event.JobScraperType,
  839. Targets: len(jobs),
  840. Errors: nil,
  841. })
  842. return scrapeResults
  843. }
  844. func (ccs *ClusterCacheScraper) GetScrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) ScrapeFunc {
  845. return func() []metric.Update {
  846. return ccs.scrapeCronJobs(cronJobs, namespaceIndex)
  847. }
  848. }
  849. func (ccs *ClusterCacheScraper) scrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) []metric.Update {
  850. var scrapeResults []metric.Update
  851. for _, cronJob := range cronJobs {
  852. nsUID, ok := namespaceIndex[cronJob.Namespace]
  853. if !ok {
  854. log.Debugf("cronjob namespaceUID missing from index for namespace name '%s'", cronJob.Namespace)
  855. }
  856. cronJobInfo := map[string]string{
  857. source.UIDLabel: string(cronJob.UID),
  858. source.NamespaceUIDLabel: string(nsUID),
  859. source.CronJobLabel: cronJob.Name,
  860. }
  861. // cronjob info
  862. scrapeResults = append(scrapeResults, metric.Update{
  863. Name: metric.CronJobInfo,
  864. Labels: cronJobInfo,
  865. Value: 0,
  866. AdditionalInfo: cronJobInfo,
  867. })
  868. // cronjob labels
  869. labelNames, labelValues := promutil.KubeLabelsToLabels(cronJob.Labels)
  870. cronJobLabels := util.ToMap(labelNames, labelValues)
  871. scrapeResults = append(scrapeResults, metric.Update{
  872. Name: metric.CronJobLabels,
  873. Labels: cronJobInfo,
  874. Value: 0,
  875. AdditionalInfo: cronJobLabels,
  876. })
  877. // cronjob annotations
  878. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(cronJob.Annotations)
  879. cronJobAnnotations := util.ToMap(annotationNames, annotationValues)
  880. scrapeResults = append(scrapeResults, metric.Update{
  881. Name: metric.CronJobAnnotations,
  882. Labels: cronJobInfo,
  883. Value: 0,
  884. AdditionalInfo: cronJobAnnotations,
  885. })
  886. }
  887. events.Dispatch(event.ScrapeEvent{
  888. ScraperName: event.KubernetesClusterScraperName,
  889. ScrapeType: event.CronJobScraperType,
  890. Targets: len(cronJobs),
  891. Errors: nil,
  892. })
  893. return scrapeResults
  894. }
  895. func (ccs *ClusterCacheScraper) GetScrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  896. return func() []metric.Update {
  897. return ccs.scrapeReplicaSets(replicaSets, namespaceIndex)
  898. }
  899. }
  900. func (ccs *ClusterCacheScraper) scrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) []metric.Update {
  901. var scrapeResults []metric.Update
  902. for _, replicaSet := range replicaSets {
  903. nsUID, ok := namespaceIndex[replicaSet.Namespace]
  904. if !ok {
  905. log.Debugf("replicaset namespaceUID missing from index for namespace name '%s'", replicaSet.Namespace)
  906. }
  907. replicaSetInfo := map[string]string{
  908. source.UIDLabel: string(replicaSet.UID),
  909. source.NamespaceUIDLabel: string(nsUID),
  910. source.ReplicaSetLabel: replicaSet.Name,
  911. }
  912. // replicaset info
  913. scrapeResults = append(scrapeResults, metric.Update{
  914. Name: metric.ReplicaSetInfo,
  915. Labels: replicaSetInfo,
  916. Value: 0,
  917. AdditionalInfo: replicaSetInfo,
  918. })
  919. // replicaset labels
  920. labelNames, labelValues := promutil.KubeLabelsToLabels(replicaSet.Labels)
  921. replicaSetLabels := util.ToMap(labelNames, labelValues)
  922. scrapeResults = append(scrapeResults, metric.Update{
  923. Name: metric.ReplicaSetLabels,
  924. Labels: replicaSetInfo,
  925. Value: 0,
  926. AdditionalInfo: replicaSetLabels,
  927. })
  928. // replicaset annotations
  929. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(replicaSet.Annotations)
  930. replicaSetAnnotations := util.ToMap(annotationNames, annotationValues)
  931. scrapeResults = append(scrapeResults, metric.Update{
  932. Name: metric.ReplicaSetAnnotations,
  933. Labels: replicaSetInfo,
  934. Value: 0,
  935. AdditionalInfo: replicaSetAnnotations,
  936. })
  937. // owner references for backward compatibility
  938. replicaSetOwnerInfo := map[string]string{
  939. source.ReplicaSetLabel: replicaSet.Name,
  940. source.NamespaceLabel: replicaSet.Namespace,
  941. source.UIDLabel: string(replicaSet.UID),
  942. }
  943. // this specific metric exports a special <none> value for name and kind
  944. // if there are no owners
  945. if len(replicaSet.OwnerReferences) == 0 {
  946. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  947. ownerInfo[source.OwnerKindLabel] = source.NoneLabelValue
  948. ownerInfo[source.OwnerNameLabel] = source.NoneLabelValue
  949. scrapeResults = append(scrapeResults, metric.Update{
  950. Name: metric.KubeReplicasetOwner,
  951. Labels: ownerInfo,
  952. Value: 0,
  953. })
  954. } else {
  955. for _, owner := range replicaSet.OwnerReferences {
  956. controller := "false"
  957. if owner.Controller != nil && *owner.Controller {
  958. controller = "true"
  959. }
  960. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  961. ownerInfo[source.OwnerKindLabel] = owner.Kind
  962. ownerInfo[source.OwnerNameLabel] = owner.Name
  963. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  964. ownerInfo[source.ControllerLabel] = controller
  965. scrapeResults = append(scrapeResults, metric.Update{
  966. Name: metric.KubeReplicasetOwner,
  967. Labels: ownerInfo,
  968. Value: 0,
  969. })
  970. }
  971. }
  972. }
  973. events.Dispatch(event.ScrapeEvent{
  974. ScraperName: event.KubernetesClusterScraperName,
  975. ScrapeType: event.ReplicaSetScraperType,
  976. Targets: len(replicaSets),
  977. Errors: nil,
  978. })
  979. return scrapeResults
  980. }
  981. func (ccs *ClusterCacheScraper) GetScrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) ScrapeFunc {
  982. return func() []metric.Update {
  983. return ccs.scrapeResourceQuotas(resourceQuotas, namespaceIndex)
  984. }
  985. }
  986. func (ccs *ClusterCacheScraper) scrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) []metric.Update {
  987. var scrapeResults []metric.Update
  988. processResource := func(baseLabels map[string]string, name v1.ResourceName, quantity resource.Quantity, metricName string) metric.Update {
  989. resource, unit, value := toResourceUnitValue(name, quantity)
  990. labels := maps.Clone(baseLabels)
  991. labels[source.ResourceLabel] = resource
  992. labels[source.UnitLabel] = unit
  993. return metric.Update{
  994. Name: metricName,
  995. Labels: labels,
  996. Value: value,
  997. }
  998. }
  999. for _, resourceQuota := range resourceQuotas {
  1000. nsUID, _ := namespaceIndex[resourceQuota.Namespace]
  1001. resourceQuotaInfo := map[string]string{
  1002. source.UIDLabel: string(resourceQuota.UID),
  1003. source.NamespaceUIDLabel: string(nsUID),
  1004. source.ResourceQuotaLabel: resourceQuota.Name,
  1005. }
  1006. scrapeResults = append(scrapeResults, metric.Update{
  1007. Name: metric.ResourceQuotaInfo,
  1008. Labels: resourceQuotaInfo,
  1009. AdditionalInfo: resourceQuotaInfo,
  1010. Value: 0,
  1011. })
  1012. if resourceQuota.Spec.Hard != nil {
  1013. // CPU/memory requests can also be aliased as "cpu" and "memory". For now, however, only scrape the complete names
  1014. // https://kubernetes.io/docs/concepts/policy/resource-quotas/#compute-resource-quota
  1015. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsCPU]; ok {
  1016. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1017. }
  1018. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsMemory]; ok {
  1019. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1020. }
  1021. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsCPU]; ok {
  1022. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1023. }
  1024. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsMemory]; ok {
  1025. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1026. }
  1027. }
  1028. if resourceQuota.Status.Used != nil {
  1029. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsCPU]; ok {
  1030. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1031. }
  1032. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsMemory]; ok {
  1033. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1034. }
  1035. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsCPU]; ok {
  1036. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1037. }
  1038. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsMemory]; ok {
  1039. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1040. }
  1041. }
  1042. }
  1043. events.Dispatch(event.ScrapeEvent{
  1044. ScraperName: event.KubernetesClusterScraperName,
  1045. ScrapeType: event.ResourceQuotaScraperType,
  1046. Targets: len(resourceQuotas),
  1047. Errors: nil,
  1048. })
  1049. return scrapeResults
  1050. }
  // PvcInfo holds the details of a single persistent volume claim that are
  // needed to track volume usage.
  type PvcInfo struct {
  	// Class is the storage class name of the claim ("" when none is set).
  	Class string
  	// Claim is the name of the persistent volume claim.
  	Claim string
  	// Namespace is the namespace the claim lives in.
  	Namespace string
  	// VolumeName is the volume name taken from the claim's spec.
  	VolumeName string
  	// Requests is the claim's storage request as a plain number.
  	Requests float64
  	// PodsClaimed lists pods for this claim; getPvcsInfo leaves it unset.
  	PodsClaimed []string
  }
  1060. func getPvcsInfo(pvcs []*clustercache.PersistentVolumeClaim) map[string]*PvcInfo {
  1061. toReturn := make(map[string]*PvcInfo)
  1062. for _, pvc := range pvcs {
  1063. ns := pvc.Namespace
  1064. pvcName := pvc.Name
  1065. volumeName := pvc.Spec.VolumeName
  1066. pvClass := getPersistentVolumeClaimClass(pvc)
  1067. requests := float64(pvc.Spec.Resources.Requests.Storage().Value())
  1068. key := ns + "," + pvcName
  1069. toReturn[key] = &PvcInfo{
  1070. Class: pvClass,
  1071. Claim: pvcName,
  1072. Namespace: ns,
  1073. VolumeName: volumeName,
  1074. Requests: requests,
  1075. }
  1076. }
  1077. return toReturn
  1078. }
  // NodeGpuInfo carries the GPU and virtual GPU counts reported for one node.
  type NodeGpuInfo struct {
  	// GPU is the number of GPUs on the node.
  	GPU float64
  	// VGPU is the number of virtual GPUs on the node.
  	VGPU float64
  }
  1084. func (ccs *ClusterCacheScraper) getNodesGpuInfo() map[string]*NodeGpuInfo {
  1085. // use a closure to cache allocatableVGPU result instead of calculating
  1086. // it every time we need it
  1087. var allocatableVGPUs *float64
  1088. allocVGPUs := func() (float64, error) {
  1089. if allocatableVGPUs != nil {
  1090. return *allocatableVGPUs, nil
  1091. }
  1092. vgpu, err := getAllocatableVGPUs(ccs.clusterCache.GetAllDaemonSets())
  1093. if err != nil {
  1094. return vgpu, err
  1095. }
  1096. allocatableVGPUs = &vgpu
  1097. return *allocatableVGPUs, nil
  1098. }
  1099. var nodeGpuMap map[string]*NodeGpuInfo = make(map[string]*NodeGpuInfo)
  1100. for _, node := range ccs.clusterCache.GetAllNodes() {
  1101. info, err := gpuInfoFor(node, allocVGPUs)
  1102. if err != nil {
  1103. log.Warnf("Failed to retrieve GPU Info for Node: %s - %s", node.Name, err)
  1104. continue
  1105. }
  1106. nodeGpuMap[node.Name] = info
  1107. }
  1108. return nodeGpuMap
  1109. }
  1110. // getPersistentVolumeClaimClass returns StorageClassName. If no storage class was
  1111. // requested, it returns "".
  1112. func getPersistentVolumeClaimClass(claim *clustercache.PersistentVolumeClaim) string {
  1113. // Use beta annotation first
  1114. if class, found := claim.Annotations[v1.BetaStorageClassAnnotation]; found {
  1115. return class
  1116. }
  1117. if claim.Spec.StorageClassName != nil {
  1118. return *claim.Spec.StorageClassName
  1119. }
  1120. // Special non-empty string to indicate absence of storage class.
  1121. return ""
  1122. }
  1123. // toResourceUnitValue accepts a resource name and quantity and returns the sanitized resource, the unit, and the value in the units.
  1124. // Returns an empty string for resource and unit if there was a failure.
  1125. func toResourceUnitValue(resourceName v1.ResourceName, quantity resource.Quantity) (resource string, unit string, value float64) {
  1126. resource = promutil.SanitizeLabelName(string(resourceName))
  1127. switch resourceName {
  1128. case v1.ResourceCPU:
  1129. unit = "core"
  1130. value = float64(quantity.MilliValue()) / 1000
  1131. return
  1132. case v1.ResourceStorage:
  1133. fallthrough
  1134. case v1.ResourceEphemeralStorage:
  1135. fallthrough
  1136. case v1.ResourceMemory:
  1137. unit = "byte"
  1138. value = float64(quantity.Value())
  1139. return
  1140. case v1.ResourcePods:
  1141. unit = "integer"
  1142. value = float64(quantity.Value())
  1143. return
  1144. default:
  1145. if isHugePageResourceName(resourceName) || isAttachableVolumeResourceName(resourceName) {
  1146. unit = "byte"
  1147. value = float64(quantity.Value())
  1148. return
  1149. }
  1150. if isExtendedResourceName(resourceName) {
  1151. unit = "integer"
  1152. value = float64(quantity.Value())
  1153. return
  1154. }
  1155. }
  1156. resource = ""
  1157. unit = ""
  1158. value = 0.0
  1159. return
  1160. }
  1161. func isGpuResourceName(name v1.ResourceName) bool {
  1162. return name == "nvidia.com/gpu" || name == "k8s.amazonaws.com/vgpu"
  1163. }
  1164. // isHugePageResourceName checks for a huge page container resource name
  1165. func isHugePageResourceName(name v1.ResourceName) bool {
  1166. return strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix)
  1167. }
  1168. // isAttachableVolumeResourceName checks for attached volume container resource name
  1169. func isAttachableVolumeResourceName(name v1.ResourceName) bool {
  1170. return strings.HasPrefix(string(name), v1.ResourceAttachableVolumesPrefix)
  1171. }
  1172. // isExtendedResourceName checks for extended container resource name
  1173. func isExtendedResourceName(name v1.ResourceName) bool {
  1174. if isNativeResource(name) || strings.HasPrefix(string(name), v1.DefaultResourceRequestsPrefix) {
  1175. return false
  1176. }
  1177. // Ensure it satisfies the rules in IsQualifiedName() after converted into quota resource name
  1178. nameForQuota := fmt.Sprintf("%s%s", v1.DefaultResourceRequestsPrefix, string(name))
  1179. if errs := validation.IsQualifiedName(nameForQuota); len(errs) != 0 {
  1180. return false
  1181. }
  1182. return true
  1183. }
  1184. // isNativeResource checks for a kubernetes.io/ prefixed resource name
  1185. func isNativeResource(name v1.ResourceName) bool {
  1186. return !strings.Contains(string(name), "/") || isPrefixedNativeResource(name)
  1187. }
  1188. func isPrefixedNativeResource(name v1.ResourceName) bool {
  1189. return strings.Contains(string(name), v1.ResourceDefaultNamespacePrefix)
  1190. }
  1191. // gets the Node GPUs and VGPUs using the node data from k8s. Returns nil if GPUs could not be located for the node.
  1192. func gpuInfoFor(
  1193. n *clustercache.Node,
  1194. allocatedVGPUs func() (float64, error),
  1195. ) (*NodeGpuInfo, error) {
  1196. g, hasGpu := n.Status.Capacity["nvidia.com/gpu"]
  1197. _, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
  1198. // Case 1: Standard NVIDIA GPU
  1199. if hasGpu && g.Value() != 0 && !hasReplicas {
  1200. return &NodeGpuInfo{
  1201. GPU: float64(g.Value()),
  1202. VGPU: float64(g.Value()),
  1203. }, nil
  1204. }
  1205. // Case 2: NVIDIA GPU with GPU Feature Discovery (GFD) Pod enabled.
  1206. // Ref: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html#verifying-the-gpu-time-slicing-configuration
  1207. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L44-L45
  1208. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L103-L118
  1209. if hasReplicas {
  1210. resultGPU := 0.0
  1211. resultVGPU := 0.0
  1212. if c, ok := n.Labels["nvidia.com/gpu.count"]; ok {
  1213. var err error
  1214. resultGPU, err = strconv.ParseFloat(c, 64)
  1215. if err != nil {
  1216. return nil, fmt.Errorf("could not parse label \"nvidia.com/gpu.count\": %v", err)
  1217. }
  1218. }
  1219. if s, ok := n.Status.Capacity["nvidia.com/gpu.shared"]; ok { // GFD configured `renameByDefault=true`
  1220. resultVGPU = float64(s.Value())
  1221. } else if g, ok := n.Status.Capacity["nvidia.com/gpu"]; ok { // GFD configured `renameByDefault=false`
  1222. resultVGPU = float64(g.Value())
  1223. } else {
  1224. resultVGPU = resultGPU
  1225. }
  1226. return &NodeGpuInfo{
  1227. GPU: resultGPU,
  1228. VGPU: resultVGPU,
  1229. }, nil
  1230. }
  1231. // Case 3: AWS vGPU
  1232. if vgpu, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
  1233. vgpuCount, err := allocatedVGPUs()
  1234. if err != nil {
  1235. return nil, err
  1236. }
  1237. vgpuCoeff := 10.0
  1238. if vgpuCount > 0.0 {
  1239. vgpuCoeff = vgpuCount
  1240. }
  1241. if vgpu.Value() != 0 {
  1242. resultGPU := float64(vgpu.Value()) / vgpuCoeff
  1243. resultVGPU := float64(vgpu.Value())
  1244. return &NodeGpuInfo{
  1245. GPU: resultGPU,
  1246. VGPU: resultVGPU,
  1247. }, nil
  1248. }
  1249. }
  1250. // No GPU found
  1251. return nil, nil
  1252. }
  1253. func getAllocatableVGPUs(daemonsets []*clustercache.DaemonSet) (float64, error) {
  1254. vgpuCount := 0.0
  1255. for _, ds := range daemonsets {
  1256. dsContainerList := &ds.SpecContainers
  1257. for _, ctnr := range *dsContainerList {
  1258. if ctnr.Args != nil {
  1259. for _, arg := range ctnr.Args {
  1260. if strings.Contains(arg, "--vgpu=") {
  1261. vgpus, err := strconv.ParseFloat(arg[strings.IndexByte(arg, '=')+1:], 64)
  1262. if err != nil {
  1263. log.Errorf("failed to parse vgpu allocation string %s: %v", arg, err)
  1264. continue
  1265. }
  1266. vgpuCount = vgpus
  1267. return vgpuCount, nil
  1268. }
  1269. }
  1270. }
  1271. }
  1272. }
  1273. return vgpuCount, nil
  1274. }