// clustercache.go

  1. package scrape
  2. import (
  3. "fmt"
  4. "slices"
  5. "strconv"
  6. "strings"
  7. "github.com/kubecost/events"
  8. "github.com/opencost/opencost/core/pkg/clustercache"
  9. "github.com/opencost/opencost/core/pkg/log"
  10. "github.com/opencost/opencost/core/pkg/source"
  11. coreutil "github.com/opencost/opencost/core/pkg/util"
  12. "github.com/opencost/opencost/core/pkg/util/promutil"
  13. "github.com/opencost/opencost/modules/collector-source/pkg/event"
  14. "github.com/opencost/opencost/modules/collector-source/pkg/metric"
  15. "github.com/opencost/opencost/modules/collector-source/pkg/util"
  16. "golang.org/x/exp/maps"
  17. v1 "k8s.io/api/core/v1"
  18. "k8s.io/apimachinery/pkg/api/resource"
  19. "k8s.io/apimachinery/pkg/types"
  20. "k8s.io/apimachinery/pkg/util/validation"
  21. )
  // unmountedPVsContainer is the value written to the pod label of the
  // PodPVCAllocation metric for a claim that no scraped pod has claimed.
  const unmountedPVsContainer = "unmounted-pvs"
  23. type ClusterCacheScraper struct {
  24. clusterCache clustercache.ClusterCache
  25. }
  26. func newClusterCacheScraper(clusterCache clustercache.ClusterCache) Scraper {
  27. return &ClusterCacheScraper{
  28. clusterCache: clusterCache,
  29. }
  30. }
  31. func (ccs *ClusterCacheScraper) Scrape() []metric.Update {
  32. // retrieve objects for scrape
  33. nodes := ccs.clusterCache.GetAllNodes()
  34. deployments := ccs.clusterCache.GetAllDeployments()
  35. namespaces := ccs.clusterCache.GetAllNamespaces()
  36. pods := ccs.clusterCache.GetAllPods()
  37. pvcs := ccs.clusterCache.GetAllPersistentVolumeClaims()
  38. pvs := ccs.clusterCache.GetAllPersistentVolumes()
  39. services := ccs.clusterCache.GetAllServices()
  40. statefulSets := ccs.clusterCache.GetAllStatefulSets()
  41. daemonSets := ccs.clusterCache.GetAllDaemonSets()
  42. jobs := ccs.clusterCache.GetAllJobs()
  43. cronJobs := ccs.clusterCache.GetAllCronJobs()
  44. replicaSets := ccs.clusterCache.GetAllReplicaSets()
  45. resourceQuotas := ccs.clusterCache.GetAllResourceQuotas()
  46. // create scrape indexes. While the pairs being mapped here don't have a 1 to 1 relationship in the general case,
  47. // we are assuming that in the context of a single snapshot of the cluster they are 1 to 1.
  48. nodeNameToUID := buildNodeIndex(nodes)
  49. namespaceNameToUID := buildNamespaceIndex(namespaces)
  50. pvcNameToUID := buildPVCIndex(pvcs)
  51. pvNameToUID := buildPVIndex(pvs)
  52. scrapeFuncs := []ScrapeFunc{
  53. ccs.GetScrapeNodes(nodes),
  54. ccs.GetScrapeDeployments(deployments, namespaceNameToUID),
  55. ccs.GetScrapeNamespaces(namespaces),
  56. ccs.GetScrapePods(pods, pvcs, nodeNameToUID, namespaceNameToUID, pvcNameToUID),
  57. ccs.GetScrapePVCs(pvcs, namespaceNameToUID, pvNameToUID),
  58. ccs.GetScrapePVs(pvs),
  59. ccs.GetScrapeServices(services),
  60. ccs.GetScrapeStatefulSets(statefulSets, namespaceNameToUID),
  61. ccs.GetScrapeDaemonSets(daemonSets, namespaceNameToUID),
  62. ccs.GetScrapeJobs(jobs, namespaceNameToUID),
  63. ccs.GetScrapeCronJobs(cronJobs, namespaceNameToUID),
  64. ccs.GetScrapeReplicaSets(replicaSets, namespaceNameToUID),
  65. ccs.GetScrapeResourceQuotas(resourceQuotas, namespaceNameToUID),
  66. }
  67. return concurrentScrape(scrapeFuncs...)
  68. }
  69. func (ccs *ClusterCacheScraper) GetScrapeNodes(nodes []*clustercache.Node) ScrapeFunc {
  70. return func() []metric.Update {
  71. return ccs.scrapeNodes(nodes)
  72. }
  73. }
  74. func (ccs *ClusterCacheScraper) scrapeNodes(nodes []*clustercache.Node) []metric.Update {
  75. var scrapeResults []metric.Update
  76. for _, node := range nodes {
  77. nodeInfo := map[string]string{
  78. source.NodeLabel: node.Name,
  79. source.ProviderIDLabel: node.SpecProviderID,
  80. source.UIDLabel: string(node.UID),
  81. }
  82. if instanceType, ok := coreutil.GetInstanceType(node.Labels); ok {
  83. nodeInfo[source.InstanceTypeLabel] = instanceType
  84. }
  85. scrapeResults = append(scrapeResults, metric.Update{
  86. Name: metric.NodeInfo,
  87. Labels: nodeInfo,
  88. AdditionalInfo: nodeInfo,
  89. })
  90. // Node Capacity
  91. scrapeResults = scrapeResourceList(
  92. metric.NodeResourceCapacities,
  93. node.Status.Capacity,
  94. nodeInfo,
  95. scrapeResults)
  96. // This block and metric can be removed, when we stop exporting assets and allocations
  97. if node.Status.Capacity != nil {
  98. if quantity, ok := node.Status.Capacity[v1.ResourceCPU]; ok {
  99. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  100. scrapeResults = append(scrapeResults, metric.Update{
  101. Name: metric.KubeNodeStatusCapacityCPUCores,
  102. Labels: nodeInfo,
  103. Value: value,
  104. })
  105. }
  106. if quantity, ok := node.Status.Capacity[v1.ResourceMemory]; ok {
  107. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  108. scrapeResults = append(scrapeResults, metric.Update{
  109. Name: metric.KubeNodeStatusCapacityMemoryBytes,
  110. Labels: nodeInfo,
  111. Value: value,
  112. })
  113. }
  114. }
  115. // Node Allocatable Resources
  116. scrapeResults = scrapeResourceList(
  117. metric.NodeResourcesAllocatable,
  118. node.Status.Allocatable,
  119. nodeInfo,
  120. scrapeResults)
  121. // This block and metric can be removed, when we stop exporting assets and allocations
  122. if node.Status.Allocatable != nil {
  123. if quantity, ok := node.Status.Allocatable[v1.ResourceCPU]; ok {
  124. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  125. scrapeResults = append(scrapeResults, metric.Update{
  126. Name: metric.KubeNodeStatusAllocatableCPUCores,
  127. Labels: nodeInfo,
  128. Value: value,
  129. })
  130. }
  131. if quantity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok {
  132. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  133. scrapeResults = append(scrapeResults, metric.Update{
  134. Name: metric.KubeNodeStatusAllocatableMemoryBytes,
  135. Labels: nodeInfo,
  136. Value: value,
  137. })
  138. }
  139. }
  140. // node labels
  141. labelNames, labelValues := promutil.KubeLabelsToLabels(node.Labels)
  142. nodeLabels := util.ToMap(labelNames, labelValues)
  143. scrapeResults = append(scrapeResults, metric.Update{
  144. Name: metric.KubeNodeLabels,
  145. Labels: nodeInfo,
  146. Value: 0,
  147. AdditionalInfo: nodeLabels,
  148. })
  149. }
  150. events.Dispatch(event.ScrapeEvent{
  151. ScraperName: event.KubernetesClusterScraperName,
  152. ScrapeType: event.NodeScraperType,
  153. Targets: len(nodes),
  154. Errors: nil,
  155. })
  156. return scrapeResults
  157. }
  158. func (ccs *ClusterCacheScraper) GetScrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) ScrapeFunc {
  159. return func() []metric.Update {
  160. return ccs.scrapeDeployments(deployments, namespaceIndex)
  161. }
  162. }
  163. func (ccs *ClusterCacheScraper) scrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) []metric.Update {
  164. var scrapeResults []metric.Update
  165. for _, deployment := range deployments {
  166. nsUID, ok := namespaceIndex[deployment.Namespace]
  167. if !ok {
  168. log.Debugf("deployment namespaceUID missing from index for namespace name '%s'", deployment.Namespace)
  169. }
  170. deploymentInfo := map[string]string{
  171. source.UIDLabel: string(deployment.UID),
  172. source.NamespaceUIDLabel: string(nsUID),
  173. source.NamespaceLabel: deployment.Namespace,
  174. source.DeploymentLabel: deployment.Name,
  175. }
  176. scrapeResults = append(scrapeResults, metric.Update{
  177. Name: metric.DeploymentInfo,
  178. Labels: deploymentInfo,
  179. Value: 0,
  180. AdditionalInfo: deploymentInfo,
  181. })
  182. // deployment labels
  183. labelNames, labelValues := promutil.KubeLabelsToLabels(deployment.Labels)
  184. deploymentLabels := util.ToMap(labelNames, labelValues)
  185. scrapeResults = append(scrapeResults, metric.Update{
  186. Name: metric.DeploymentLabels,
  187. Labels: deploymentInfo,
  188. Value: 0,
  189. AdditionalInfo: deploymentLabels,
  190. })
  191. // deployment annotations
  192. annoationNames, annotationValues := promutil.KubeAnnotationsToLabels(deployment.Annotations)
  193. deploymentAnnotations := util.ToMap(annoationNames, annotationValues)
  194. scrapeResults = append(scrapeResults, metric.Update{
  195. Name: metric.DeploymentAnnotations,
  196. Labels: deploymentInfo,
  197. Value: 0,
  198. AdditionalInfo: deploymentAnnotations,
  199. })
  200. // deployment match labels
  201. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(deployment.MatchLabels)
  202. deploymentMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  203. scrapeResults = append(scrapeResults, metric.Update{
  204. Name: metric.DeploymentMatchLabels,
  205. Labels: deploymentInfo,
  206. Value: 0,
  207. AdditionalInfo: deploymentMatchLabels,
  208. })
  209. }
  210. events.Dispatch(event.ScrapeEvent{
  211. ScraperName: event.KubernetesClusterScraperName,
  212. ScrapeType: event.DeploymentScraperType,
  213. Targets: len(deployments),
  214. Errors: nil,
  215. })
  216. return scrapeResults
  217. }
  218. func (ccs *ClusterCacheScraper) GetScrapeNamespaces(namespaces []*clustercache.Namespace) ScrapeFunc {
  219. return func() []metric.Update {
  220. return ccs.scrapeNamespaces(namespaces)
  221. }
  222. }
  223. func (ccs *ClusterCacheScraper) scrapeNamespaces(namespaces []*clustercache.Namespace) []metric.Update {
  224. var scrapeResults []metric.Update
  225. for _, namespace := range namespaces {
  226. namespaceInfo := map[string]string{
  227. source.NamespaceLabel: namespace.Name,
  228. source.UIDLabel: string(namespace.UID),
  229. }
  230. scrapeResults = append(scrapeResults, metric.Update{
  231. Name: metric.NamespaceInfo,
  232. Labels: namespaceInfo,
  233. AdditionalInfo: namespaceInfo,
  234. Value: 0,
  235. })
  236. // namespace labels
  237. labelNames, labelValues := promutil.KubeLabelsToLabels(namespace.Labels)
  238. namespaceLabels := util.ToMap(labelNames, labelValues)
  239. scrapeResults = append(scrapeResults, metric.Update{
  240. Name: metric.KubeNamespaceLabels,
  241. Labels: namespaceInfo,
  242. Value: 0,
  243. AdditionalInfo: namespaceLabels,
  244. })
  245. // namespace annotations
  246. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(namespace.Annotations)
  247. namespaceAnnotations := util.ToMap(annotationNames, annotationValues)
  248. scrapeResults = append(scrapeResults, metric.Update{
  249. Name: metric.KubeNamespaceAnnotations,
  250. Labels: namespaceInfo,
  251. Value: 0,
  252. AdditionalInfo: namespaceAnnotations,
  253. })
  254. }
  255. events.Dispatch(event.ScrapeEvent{
  256. ScraperName: event.KubernetesClusterScraperName,
  257. ScrapeType: event.NamespaceScraperType,
  258. Targets: len(namespaces),
  259. Errors: nil,
  260. })
  261. return scrapeResults
  262. }
  263. func (ccs *ClusterCacheScraper) GetScrapePods(
  264. pods []*clustercache.Pod,
  265. pvcs []*clustercache.PersistentVolumeClaim,
  266. nodeIndex map[string]types.UID,
  267. namespaceIndex map[string]types.UID,
  268. pvcIndex map[pvcKey]types.UID,
  269. ) ScrapeFunc {
  270. return func() []metric.Update {
  271. return ccs.scrapePods(pods, pvcs, nodeIndex, namespaceIndex, pvcIndex)
  272. }
  273. }
  274. func (ccs *ClusterCacheScraper) scrapePods(
  275. pods []*clustercache.Pod,
  276. pvcs []*clustercache.PersistentVolumeClaim,
  277. nodeIndex map[string]types.UID,
  278. namespaceIndex map[string]types.UID,
  279. pvcIndex map[pvcKey]types.UID,
  280. ) []metric.Update {
  281. // this is only populated if we find gpu resources being requested
  282. var nodesGpuInfo map[string]*NodeGpuInfo
  283. // pv allocation and unmounted pvs
  284. pvcInfo := getPvcsInfo(pvcs)
  285. // pod info by uid
  286. podInfoByUid := make(map[string]map[string]string)
  287. var scrapeResults []metric.Update
  288. for _, pod := range pods {
  289. nodeUID, ok := nodeIndex[pod.Spec.NodeName]
  290. if !ok {
  291. log.Debugf("pod nodeUID missing from index for node name '%s'", pod.Spec.NodeName)
  292. }
  293. nsUID, ok := namespaceIndex[pod.Namespace]
  294. if !ok {
  295. log.Debugf("pod namespaceUID missing from index for namespace name '%s'", pod.Namespace)
  296. }
  297. podInfo := map[string]string{
  298. source.UIDLabel: string(pod.UID),
  299. source.PodLabel: pod.Name,
  300. source.NamespaceUIDLabel: string(nsUID),
  301. source.NodeUIDLabel: string(nodeUID),
  302. }
  303. scrapeResults = append(scrapeResults, metric.Update{
  304. Name: metric.PodInfo,
  305. Labels: podInfo,
  306. Value: 0,
  307. AdditionalInfo: podInfo,
  308. })
  309. podInfo[source.NamespaceLabel] = pod.Namespace
  310. podInfo[source.NodeLabel] = pod.Spec.NodeName
  311. podInfo[source.InstanceLabel] = pod.Spec.NodeName
  312. podInfoByUid[string(pod.UID)] = podInfo
  313. // pod labels
  314. labelNames, labelValues := promutil.KubeLabelsToLabels(pod.Labels)
  315. podLabels := util.ToMap(labelNames, labelValues)
  316. scrapeResults = append(scrapeResults, metric.Update{
  317. Name: metric.KubePodLabels,
  318. Labels: podInfo,
  319. Value: 0,
  320. AdditionalInfo: podLabels,
  321. })
  322. // pod annotations
  323. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(pod.Annotations)
  324. podAnnotations := util.ToMap(annotationNames, annotationValues)
  325. scrapeResults = append(scrapeResults, metric.Update{
  326. Name: metric.KubePodAnnotations,
  327. Labels: podInfo,
  328. Value: 0,
  329. AdditionalInfo: podAnnotations,
  330. })
  331. // Determine PVC use data for Pod
  332. claimed := make(map[string]struct{})
  333. for _, volume := range pod.Spec.Volumes {
  334. if volume.PersistentVolumeClaim != nil {
  335. name := volume.PersistentVolumeClaim.ClaimName
  336. key := pod.Namespace + "," + name
  337. if _, seen := claimed[key]; seen {
  338. continue
  339. }
  340. if pvc, ok := pvcInfo[key]; ok {
  341. pvc.PodsClaimed = append(pvc.PodsClaimed, string(pod.UID))
  342. claimed[key] = struct{}{}
  343. }
  344. }
  345. }
  346. // Pod owner metric
  347. for _, owner := range pod.OwnerReferences {
  348. controller := "false"
  349. if owner.Controller != nil && *owner.Controller {
  350. controller = "true"
  351. }
  352. ownerInfo := maps.Clone(podInfo)
  353. ownerInfo[source.OwnerKindLabel] = owner.Kind
  354. ownerInfo[source.OwnerNameLabel] = owner.Name
  355. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  356. ownerInfo[source.ContainerLabel] = controller
  357. scrapeResults = append(scrapeResults, metric.Update{
  358. Name: metric.KubePodOwner,
  359. Labels: ownerInfo,
  360. Value: 0,
  361. })
  362. }
  363. // Container Status
  364. for _, status := range pod.Status.ContainerStatuses {
  365. if status.State.Running != nil {
  366. containerInfo := maps.Clone(podInfo)
  367. containerInfo[source.ContainerLabel] = status.Name
  368. scrapeResults = append(scrapeResults, metric.Update{
  369. Name: metric.KubePodContainerStatusRunning,
  370. Labels: containerInfo,
  371. AdditionalInfo: containerInfo,
  372. Value: 0,
  373. })
  374. }
  375. }
  376. for _, volume := range pod.Spec.Volumes {
  377. if volume.PersistentVolumeClaim != nil {
  378. pvcUID, ok := pvcIndex[pvcKey{
  379. name: volume.PersistentVolumeClaim.ClaimName,
  380. namespace: pod.Namespace,
  381. }]
  382. if !ok {
  383. continue
  384. }
  385. podPVCVolumeInfo := map[string]string{
  386. source.UIDLabel: string(pod.UID),
  387. source.PVCUIDLabel: string(pvcUID),
  388. source.PodVolumeNameLabel: volume.Name,
  389. }
  390. scrapeResults = append(scrapeResults, metric.Update{
  391. Name: metric.PodPVCVolume,
  392. Labels: podPVCVolumeInfo,
  393. Value: 0,
  394. })
  395. }
  396. }
  397. for _, container := range pod.Spec.Containers {
  398. containerInfo := maps.Clone(podInfo)
  399. containerInfo[source.ContainerLabel] = container.Name
  400. // Requests
  401. scrapeResults = scrapeResourceList(
  402. metric.KubePodContainerResourceRequests,
  403. container.Resources.Requests,
  404. containerInfo,
  405. scrapeResults)
  406. // Limits
  407. scrapeResults = scrapeResourceList(
  408. metric.KubePodContainerResourceLimits,
  409. container.Resources.Limits,
  410. containerInfo,
  411. scrapeResults)
  412. // Todo remove when asset/allocation pipeline are removed
  413. // gpu "requests" is either the request or limit if it exists
  414. var gpuRequest *float64
  415. for resourceName, quantity := range container.Resources.Requests {
  416. if isGpuResourceName(resourceName) {
  417. // set gpu request if it exists
  418. _, _, value := toResourceUnitValue(resourceName, quantity)
  419. gpuRequestValue := value
  420. gpuRequest = &gpuRequestValue
  421. break
  422. }
  423. }
  424. // Limits
  425. if gpuRequest == nil {
  426. for resourceName, quantity := range container.Resources.Limits {
  427. if isGpuResourceName(resourceName) {
  428. // set gpu request if it exists
  429. _, _, value := toResourceUnitValue(resourceName, quantity)
  430. gpuRequestValue := value
  431. gpuRequest = &gpuRequestValue
  432. break
  433. }
  434. }
  435. }
  436. // handle the GPU allocation metric here IFF there exists a request/limit for GPUs
  437. // we only load the node gpu data map if we run into a container with gpu requests/limits
  438. if gpuRequest != nil {
  439. if nodesGpuInfo == nil {
  440. nodesGpuInfo = ccs.getNodesGpuInfo()
  441. }
  442. gpuAlloc := *gpuRequest
  443. if nodeGpuInfo, ok := nodesGpuInfo[pod.Spec.NodeName]; ok {
  444. if nodeGpuInfo != nil && nodeGpuInfo.VGPU != 0 {
  445. gpuAlloc = gpuAlloc * (nodeGpuInfo.GPU / nodeGpuInfo.VGPU)
  446. }
  447. }
  448. scrapeResults = append(scrapeResults, metric.Update{
  449. Name: metric.ContainerGPUAllocation,
  450. Labels: maps.Clone(containerInfo),
  451. Value: gpuAlloc,
  452. })
  453. }
  454. }
  455. }
  456. // Iterate through PVC Info after the pods have been tallied and export
  457. // allocation metrics based on the number of other pods claiming the volume
  458. for _, pvc := range pvcInfo {
  459. // unmounted pvs get full allocation
  460. if len(pvc.PodsClaimed) == 0 {
  461. labels := map[string]string{
  462. source.PodLabel: unmountedPVsContainer,
  463. source.NamespaceLabel: pvc.Namespace,
  464. source.PVCLabel: pvc.Claim,
  465. source.PVLabel: pvc.VolumeName,
  466. }
  467. scrapeResults = append(scrapeResults, metric.Update{
  468. Name: metric.PodPVCAllocation,
  469. Labels: labels,
  470. Value: pvc.Requests,
  471. })
  472. continue
  473. }
  474. // pods get a proportion of pv allocation
  475. value := pvc.Requests / float64(len(pvc.PodsClaimed))
  476. for _, podUid := range pvc.PodsClaimed {
  477. podInfo, ok := podInfoByUid[podUid]
  478. if !ok {
  479. continue
  480. }
  481. pvcLabels := maps.Clone(podInfo)
  482. pvcLabels[source.PVCLabel] = pvc.Claim
  483. pvcLabels[source.PVLabel] = pvc.VolumeName
  484. scrapeResults = append(scrapeResults, metric.Update{
  485. Name: metric.PodPVCAllocation,
  486. Labels: pvcLabels,
  487. Value: value,
  488. })
  489. }
  490. }
  491. events.Dispatch(event.ScrapeEvent{
  492. ScraperName: event.KubernetesClusterScraperName,
  493. ScrapeType: event.PodScraperType,
  494. Targets: len(pods),
  495. Errors: nil,
  496. })
  497. return scrapeResults
  498. }
  499. func scrapeResourceList(metricName string, resourceList v1.ResourceList, baseLabels map[string]string, scrapeResults []metric.Update) []metric.Update {
  500. if resourceList != nil {
  501. // sorting keys here for testing purposes
  502. keys := maps.Keys(resourceList)
  503. slices.Sort(keys)
  504. for _, resourceName := range keys {
  505. quantity := resourceList[resourceName]
  506. resource, unit, value := toResourceUnitValue(resourceName, quantity)
  507. // failed to parse the resource type
  508. if resource == "" {
  509. log.DedupedWarningf(5, "Failed to parse resource units and quantity for resource: %s", resourceName)
  510. continue
  511. }
  512. resourceRequestInfo := maps.Clone(baseLabels)
  513. resourceRequestInfo[source.ResourceLabel] = resource
  514. resourceRequestInfo[source.UnitLabel] = unit
  515. scrapeResults = append(scrapeResults, metric.Update{
  516. Name: metricName,
  517. Labels: resourceRequestInfo,
  518. Value: value,
  519. })
  520. }
  521. }
  522. return scrapeResults
  523. }
  524. func (ccs *ClusterCacheScraper) GetScrapePVCs(
  525. pvcs []*clustercache.PersistentVolumeClaim,
  526. namespaceIndex map[string]types.UID,
  527. pvIndex map[string]types.UID,
  528. ) ScrapeFunc {
  529. return func() []metric.Update {
  530. return ccs.scrapePVCs(pvcs, namespaceIndex, pvIndex)
  531. }
  532. }
  533. func (ccs *ClusterCacheScraper) scrapePVCs(
  534. pvcs []*clustercache.PersistentVolumeClaim,
  535. namespaceIndex map[string]types.UID,
  536. pvIndex map[string]types.UID,
  537. ) []metric.Update {
  538. var scrapeResults []metric.Update
  539. for _, pvc := range pvcs {
  540. nsUID, ok := namespaceIndex[pvc.Namespace]
  541. if !ok {
  542. log.Debugf("pvc namespaceUID missing from index for namespace name '%s'", pvc.Namespace)
  543. }
  544. pvUID, ok := pvIndex[pvc.Spec.VolumeName]
  545. if !ok && pvc.Spec.VolumeName != "" {
  546. log.Debugf("pvc volume name missing from index for pv name '%s'", pvc.Spec.VolumeName)
  547. }
  548. pvcInfo := map[string]string{
  549. source.UIDLabel: string(pvc.UID),
  550. source.PVCLabel: pvc.Name,
  551. source.NamespaceUIDLabel: string(nsUID),
  552. source.NamespaceLabel: pvc.Namespace,
  553. source.VolumeNameLabel: pvc.Spec.VolumeName,
  554. source.PVUIDLabel: string(pvUID),
  555. source.StorageClassLabel: getPersistentVolumeClaimClass(pvc),
  556. }
  557. scrapeResults = append(scrapeResults, metric.Update{
  558. Name: metric.KubePersistentVolumeClaimInfo,
  559. Labels: pvcInfo,
  560. AdditionalInfo: pvcInfo,
  561. Value: 0,
  562. })
  563. if storage, ok := pvc.Spec.Resources.Requests[v1.ResourceStorage]; ok {
  564. scrapeResults = append(scrapeResults, metric.Update{
  565. Name: metric.KubePersistentVolumeClaimResourceRequestsStorageBytes,
  566. Labels: pvcInfo,
  567. Value: float64(storage.Value()),
  568. })
  569. }
  570. }
  571. events.Dispatch(event.ScrapeEvent{
  572. ScraperName: event.KubernetesClusterScraperName,
  573. ScrapeType: event.PvcScraperType,
  574. Targets: len(pvcs),
  575. Errors: nil,
  576. })
  577. return scrapeResults
  578. }
  579. func (ccs *ClusterCacheScraper) GetScrapePVs(pvs []*clustercache.PersistentVolume) ScrapeFunc {
  580. return func() []metric.Update {
  581. return ccs.scrapePVs(pvs)
  582. }
  583. }
  584. func (ccs *ClusterCacheScraper) scrapePVs(pvs []*clustercache.PersistentVolume) []metric.Update {
  585. var scrapeResults []metric.Update
  586. for _, pv := range pvs {
  587. providerID := pv.Name
  588. var csiVolumeHandle string
  589. // if a more accurate provider ID is available, use that
  590. if pv.Spec.CSI != nil && pv.Spec.CSI.VolumeHandle != "" {
  591. providerID = pv.Spec.CSI.VolumeHandle
  592. csiVolumeHandle = pv.Spec.CSI.VolumeHandle
  593. }
  594. pvInfo := map[string]string{
  595. source.UIDLabel: string(pv.UID),
  596. source.PVLabel: pv.Name,
  597. source.StorageClassLabel: pv.Spec.StorageClassName,
  598. source.ProviderIDLabel: providerID,
  599. source.CSIVolumeHandleLabel: csiVolumeHandle,
  600. }
  601. scrapeResults = append(scrapeResults, metric.Update{
  602. Name: metric.KubecostPVInfo,
  603. Labels: pvInfo,
  604. AdditionalInfo: pvInfo,
  605. Value: 0,
  606. })
  607. if storage, ok := pv.Spec.Capacity[v1.ResourceStorage]; ok {
  608. scrapeResults = append(scrapeResults, metric.Update{
  609. Name: metric.KubePersistentVolumeCapacityBytes,
  610. Labels: pvInfo,
  611. Value: float64(storage.Value()),
  612. })
  613. }
  614. }
  615. events.Dispatch(event.ScrapeEvent{
  616. ScraperName: event.KubernetesClusterScraperName,
  617. ScrapeType: event.PvScraperType,
  618. Targets: len(pvs),
  619. Errors: nil,
  620. })
  621. return scrapeResults
  622. }
  623. func (ccs *ClusterCacheScraper) GetScrapeServices(services []*clustercache.Service) ScrapeFunc {
  624. return func() []metric.Update {
  625. return ccs.scrapeServices(services)
  626. }
  627. }
  628. func (ccs *ClusterCacheScraper) scrapeServices(services []*clustercache.Service) []metric.Update {
  629. var scrapeResults []metric.Update
  630. for _, service := range services {
  631. serviceInfo := map[string]string{
  632. source.UIDLabel: string(service.UID),
  633. source.ServiceLabel: service.Name,
  634. source.NamespaceLabel: service.Namespace,
  635. source.ServiceTypeLabel: string(service.Type),
  636. }
  637. scrapeResults = append(scrapeResults, metric.Update{
  638. Name: metric.ServiceInfo,
  639. Labels: serviceInfo,
  640. Value: 0,
  641. AdditionalInfo: serviceInfo,
  642. })
  643. // service selector labels
  644. selectorNames, selectorValues := promutil.KubeLabelsToLabels(service.SpecSelector)
  645. serviceLabels := util.ToMap(selectorNames, selectorValues)
  646. scrapeResults = append(scrapeResults, metric.Update{
  647. Name: metric.ServiceSelectorLabels,
  648. Labels: serviceInfo,
  649. Value: 0,
  650. AdditionalInfo: serviceLabels,
  651. })
  652. }
  653. events.Dispatch(event.ScrapeEvent{
  654. ScraperName: event.KubernetesClusterScraperName,
  655. ScrapeType: event.ServiceScraperType,
  656. Targets: len(services),
  657. Errors: nil,
  658. })
  659. return scrapeResults
  660. }
  661. func (ccs *ClusterCacheScraper) GetScrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  662. return func() []metric.Update {
  663. return ccs.scrapeStatefulSets(statefulSets, namespaceIndex)
  664. }
  665. }
  666. func (ccs *ClusterCacheScraper) scrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) []metric.Update {
  667. var scrapeResults []metric.Update
  668. for _, statefulSet := range statefulSets {
  669. nsUID, ok := namespaceIndex[statefulSet.Namespace]
  670. if !ok {
  671. log.Debugf("statefulSet namespaceUID missing from index for namespace name '%s'", statefulSet.Namespace)
  672. }
  673. statefulSetInfo := map[string]string{
  674. source.UIDLabel: string(statefulSet.UID),
  675. source.NamespaceUIDLabel: string(nsUID),
  676. source.StatefulSetLabel: statefulSet.Name,
  677. }
  678. // statefulSet info
  679. scrapeResults = append(scrapeResults, metric.Update{
  680. Name: metric.StatefulSetInfo,
  681. Labels: statefulSetInfo,
  682. Value: 0,
  683. AdditionalInfo: statefulSetInfo,
  684. })
  685. // statefulSet labels
  686. labelNames, labelValues := promutil.KubeLabelsToLabels(statefulSet.Labels)
  687. statefulSetLabels := util.ToMap(labelNames, labelValues)
  688. scrapeResults = append(scrapeResults, metric.Update{
  689. Name: metric.StatefulSetLabels,
  690. Labels: statefulSetInfo,
  691. Value: 0,
  692. AdditionalInfo: statefulSetLabels,
  693. })
  694. // statefulSet annotations
  695. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(statefulSet.Annotations)
  696. statefulSetAnnotations := util.ToMap(annotationNames, annotationValues)
  697. scrapeResults = append(scrapeResults, metric.Update{
  698. Name: metric.StatefulSetAnnotations,
  699. Labels: statefulSetInfo,
  700. Value: 0,
  701. AdditionalInfo: statefulSetAnnotations,
  702. })
  703. // statefulSet match labels
  704. statefulSetInfo[source.NamespaceLabel] = statefulSet.Namespace
  705. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(statefulSet.SpecSelector.MatchLabels)
  706. statefulSetMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  707. scrapeResults = append(scrapeResults, metric.Update{
  708. Name: metric.StatefulSetMatchLabels,
  709. Labels: statefulSetInfo,
  710. Value: 0,
  711. AdditionalInfo: statefulSetMatchLabels,
  712. })
  713. }
  714. events.Dispatch(event.ScrapeEvent{
  715. ScraperName: event.KubernetesClusterScraperName,
  716. ScrapeType: event.StatefulSetScraperType,
  717. Targets: len(statefulSets),
  718. Errors: nil,
  719. })
  720. return scrapeResults
  721. }
  722. func (ccs *ClusterCacheScraper) GetScrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  723. return func() []metric.Update {
  724. return ccs.scrapeDaemonSets(daemonSets, namespaceIndex)
  725. }
  726. }
  727. func (ccs *ClusterCacheScraper) scrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) []metric.Update {
  728. var scrapeResults []metric.Update
  729. for _, daemonSet := range daemonSets {
  730. nsUID, ok := namespaceIndex[daemonSet.Namespace]
  731. if !ok {
  732. log.Debugf("daemonSet namespaceUID missing from index for namespace name '%s'", daemonSet.Namespace)
  733. }
  734. daemonSetInfo := map[string]string{
  735. source.UIDLabel: string(daemonSet.UID),
  736. source.NamespaceUIDLabel: string(nsUID),
  737. source.DaemonSetLabel: daemonSet.Name,
  738. }
  739. // daemonSet info
  740. scrapeResults = append(scrapeResults, metric.Update{
  741. Name: metric.DaemonSetInfo,
  742. Labels: daemonSetInfo,
  743. Value: 0,
  744. AdditionalInfo: daemonSetInfo,
  745. })
  746. // daemonSet labels
  747. labelNames, labelValues := promutil.KubeLabelsToLabels(daemonSet.Labels)
  748. daemonSetLabels := util.ToMap(labelNames, labelValues)
  749. scrapeResults = append(scrapeResults, metric.Update{
  750. Name: metric.DaemonSetLabels,
  751. Labels: daemonSetInfo,
  752. Value: 0,
  753. AdditionalInfo: daemonSetLabels,
  754. })
  755. // daemonSet annotations
  756. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(daemonSet.Annotations)
  757. daemonSetAnnotations := util.ToMap(annotationNames, annotationValues)
  758. scrapeResults = append(scrapeResults, metric.Update{
  759. Name: metric.DaemonSetAnnotations,
  760. Labels: daemonSetInfo,
  761. Value: 0,
  762. AdditionalInfo: daemonSetAnnotations,
  763. })
  764. }
  765. events.Dispatch(event.ScrapeEvent{
  766. ScraperName: event.KubernetesClusterScraperName,
  767. ScrapeType: event.DaemonSetScraperType,
  768. Targets: len(daemonSets),
  769. Errors: nil,
  770. })
  771. return scrapeResults
  772. }
  773. func (ccs *ClusterCacheScraper) GetScrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) ScrapeFunc {
  774. return func() []metric.Update {
  775. return ccs.scrapeJobs(jobs, namespaceIndex)
  776. }
  777. }
  778. func (ccs *ClusterCacheScraper) scrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) []metric.Update {
  779. var scrapeResults []metric.Update
  780. for _, job := range jobs {
  781. nsUID, ok := namespaceIndex[job.Namespace]
  782. if !ok {
  783. log.Debugf("job namespaceUID missing from index for namespace name '%s'", job.Namespace)
  784. }
  785. jobInfo := map[string]string{
  786. source.UIDLabel: string(job.UID),
  787. source.NamespaceUIDLabel: string(nsUID),
  788. source.JobLabel: job.Name,
  789. }
  790. // job info
  791. scrapeResults = append(scrapeResults, metric.Update{
  792. Name: metric.JobInfo,
  793. Labels: jobInfo,
  794. Value: 0,
  795. AdditionalInfo: jobInfo,
  796. })
  797. // job labels
  798. labelNames, labelValues := promutil.KubeLabelsToLabels(job.Labels)
  799. jobLabels := util.ToMap(labelNames, labelValues)
  800. scrapeResults = append(scrapeResults, metric.Update{
  801. Name: metric.JobLabels,
  802. Labels: jobInfo,
  803. Value: 0,
  804. AdditionalInfo: jobLabels,
  805. })
  806. // job annotations
  807. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(job.Annotations)
  808. jobAnnotations := util.ToMap(annotationNames, annotationValues)
  809. scrapeResults = append(scrapeResults, metric.Update{
  810. Name: metric.JobAnnotations,
  811. Labels: jobInfo,
  812. Value: 0,
  813. AdditionalInfo: jobAnnotations,
  814. })
  815. }
  816. events.Dispatch(event.ScrapeEvent{
  817. ScraperName: event.KubernetesClusterScraperName,
  818. ScrapeType: event.JobScraperType,
  819. Targets: len(jobs),
  820. Errors: nil,
  821. })
  822. return scrapeResults
  823. }
  824. func (ccs *ClusterCacheScraper) GetScrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) ScrapeFunc {
  825. return func() []metric.Update {
  826. return ccs.scrapeCronJobs(cronJobs, namespaceIndex)
  827. }
  828. }
  829. func (ccs *ClusterCacheScraper) scrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) []metric.Update {
  830. var scrapeResults []metric.Update
  831. for _, cronJob := range cronJobs {
  832. nsUID, ok := namespaceIndex[cronJob.Namespace]
  833. if !ok {
  834. log.Debugf("cronjob namespaceUID missing from index for namespace name '%s'", cronJob.Namespace)
  835. }
  836. cronJobInfo := map[string]string{
  837. source.UIDLabel: string(cronJob.UID),
  838. source.NamespaceUIDLabel: string(nsUID),
  839. source.CronJobLabel: cronJob.Name,
  840. }
  841. // cronjob info
  842. scrapeResults = append(scrapeResults, metric.Update{
  843. Name: metric.CronJobInfo,
  844. Labels: cronJobInfo,
  845. Value: 0,
  846. AdditionalInfo: cronJobInfo,
  847. })
  848. // cronjob labels
  849. labelNames, labelValues := promutil.KubeLabelsToLabels(cronJob.Labels)
  850. cronJobLabels := util.ToMap(labelNames, labelValues)
  851. scrapeResults = append(scrapeResults, metric.Update{
  852. Name: metric.CronJobLabels,
  853. Labels: cronJobInfo,
  854. Value: 0,
  855. AdditionalInfo: cronJobLabels,
  856. })
  857. // cronjob annotations
  858. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(cronJob.Annotations)
  859. cronJobAnnotations := util.ToMap(annotationNames, annotationValues)
  860. scrapeResults = append(scrapeResults, metric.Update{
  861. Name: metric.CronJobAnnotations,
  862. Labels: cronJobInfo,
  863. Value: 0,
  864. AdditionalInfo: cronJobAnnotations,
  865. })
  866. }
  867. events.Dispatch(event.ScrapeEvent{
  868. ScraperName: event.KubernetesClusterScraperName,
  869. ScrapeType: event.CronJobScraperType,
  870. Targets: len(cronJobs),
  871. Errors: nil,
  872. })
  873. return scrapeResults
  874. }
  875. func (ccs *ClusterCacheScraper) GetScrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  876. return func() []metric.Update {
  877. return ccs.scrapeReplicaSets(replicaSets, namespaceIndex)
  878. }
  879. }
  880. func (ccs *ClusterCacheScraper) scrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) []metric.Update {
  881. var scrapeResults []metric.Update
  882. for _, replicaSet := range replicaSets {
  883. nsUID, ok := namespaceIndex[replicaSet.Namespace]
  884. if !ok {
  885. log.Debugf("replicaset namespaceUID missing from index for namespace name '%s'", replicaSet.Namespace)
  886. }
  887. replicaSetInfo := map[string]string{
  888. source.UIDLabel: string(replicaSet.UID),
  889. source.NamespaceUIDLabel: string(nsUID),
  890. source.ReplicaSetLabel: replicaSet.Name,
  891. }
  892. // replicaset info
  893. scrapeResults = append(scrapeResults, metric.Update{
  894. Name: metric.ReplicaSetInfo,
  895. Labels: replicaSetInfo,
  896. Value: 0,
  897. AdditionalInfo: replicaSetInfo,
  898. })
  899. // replicaset labels
  900. labelNames, labelValues := promutil.KubeLabelsToLabels(replicaSet.Labels)
  901. replicaSetLabels := util.ToMap(labelNames, labelValues)
  902. scrapeResults = append(scrapeResults, metric.Update{
  903. Name: metric.ReplicaSetLabels,
  904. Labels: replicaSetInfo,
  905. Value: 0,
  906. AdditionalInfo: replicaSetLabels,
  907. })
  908. // replicaset annotations
  909. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(replicaSet.Annotations)
  910. replicaSetAnnotations := util.ToMap(annotationNames, annotationValues)
  911. scrapeResults = append(scrapeResults, metric.Update{
  912. Name: metric.ReplicaSetAnnotations,
  913. Labels: replicaSetInfo,
  914. Value: 0,
  915. AdditionalInfo: replicaSetAnnotations,
  916. })
  917. // owner references for backward compatibility
  918. replicaSetOwnerInfo := map[string]string{
  919. source.ReplicaSetLabel: replicaSet.Name,
  920. source.NamespaceLabel: replicaSet.Namespace,
  921. source.UIDLabel: string(replicaSet.UID),
  922. }
  923. // this specific metric exports a special <none> value for name and kind
  924. // if there are no owners
  925. if len(replicaSet.OwnerReferences) == 0 {
  926. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  927. ownerInfo[source.OwnerKindLabel] = source.NoneLabelValue
  928. ownerInfo[source.OwnerNameLabel] = source.NoneLabelValue
  929. scrapeResults = append(scrapeResults, metric.Update{
  930. Name: metric.KubeReplicasetOwner,
  931. Labels: ownerInfo,
  932. Value: 0,
  933. })
  934. } else {
  935. for _, owner := range replicaSet.OwnerReferences {
  936. controller := "false"
  937. if owner.Controller != nil && *owner.Controller {
  938. controller = "true"
  939. }
  940. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  941. ownerInfo[source.OwnerKindLabel] = owner.Kind
  942. ownerInfo[source.OwnerNameLabel] = owner.Name
  943. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  944. ownerInfo[source.ControllerLabel] = controller
  945. scrapeResults = append(scrapeResults, metric.Update{
  946. Name: metric.KubeReplicasetOwner,
  947. Labels: ownerInfo,
  948. Value: 0,
  949. })
  950. }
  951. }
  952. }
  953. events.Dispatch(event.ScrapeEvent{
  954. ScraperName: event.KubernetesClusterScraperName,
  955. ScrapeType: event.ReplicaSetScraperType,
  956. Targets: len(replicaSets),
  957. Errors: nil,
  958. })
  959. return scrapeResults
  960. }
  961. func (ccs *ClusterCacheScraper) GetScrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) ScrapeFunc {
  962. return func() []metric.Update {
  963. return ccs.scrapeResourceQuotas(resourceQuotas, namespaceIndex)
  964. }
  965. }
  966. func (ccs *ClusterCacheScraper) scrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) []metric.Update {
  967. var scrapeResults []metric.Update
  968. processResource := func(baseLabels map[string]string, name v1.ResourceName, quantity resource.Quantity, metricName string) metric.Update {
  969. resource, unit, value := toResourceUnitValue(name, quantity)
  970. labels := maps.Clone(baseLabels)
  971. labels[source.ResourceLabel] = resource
  972. labels[source.UnitLabel] = unit
  973. return metric.Update{
  974. Name: metricName,
  975. Labels: labels,
  976. Value: value,
  977. }
  978. }
  979. for _, resourceQuota := range resourceQuotas {
  980. nsUID, _ := namespaceIndex[resourceQuota.Namespace]
  981. resourceQuotaInfo := map[string]string{
  982. source.UIDLabel: string(resourceQuota.UID),
  983. source.NamespaceUIDLabel: string(nsUID),
  984. source.ResourceQuotaLabel: resourceQuota.Name,
  985. }
  986. scrapeResults = append(scrapeResults, metric.Update{
  987. Name: metric.ResourceQuotaInfo,
  988. Labels: resourceQuotaInfo,
  989. AdditionalInfo: resourceQuotaInfo,
  990. Value: 0,
  991. })
  992. if resourceQuota.Spec.Hard != nil {
  993. // CPU/memory requests can also be aliased as "cpu" and "memory". For now, however, only scrape the complete names
  994. // https://kubernetes.io/docs/concepts/policy/resource-quotas/#compute-resource-quota
  995. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsCPU]; ok {
  996. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  997. }
  998. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsMemory]; ok {
  999. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1000. }
  1001. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsCPU]; ok {
  1002. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1003. }
  1004. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsMemory]; ok {
  1005. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1006. }
  1007. }
  1008. if resourceQuota.Status.Used != nil {
  1009. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsCPU]; ok {
  1010. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1011. }
  1012. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsMemory]; ok {
  1013. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1014. }
  1015. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsCPU]; ok {
  1016. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1017. }
  1018. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsMemory]; ok {
  1019. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1020. }
  1021. }
  1022. }
  1023. events.Dispatch(event.ScrapeEvent{
  1024. ScraperName: event.KubernetesClusterScraperName,
  1025. ScrapeType: event.ResourceQuotaScraperType,
  1026. Targets: len(resourceQuotas),
  1027. Errors: nil,
  1028. })
  1029. return scrapeResults
  1030. }
// PvcInfo is used to store information about a pvc for tracking volume usage.
// getPvcsInfo fills every field except PodsClaimed.
type PvcInfo struct {
	// Class is the storage class name as resolved by getPersistentVolumeClaimClass;
	// it is "" when the claim does not name one.
	Class string
	// Claim is the name of the persistent volume claim.
	Claim string
	// Namespace is the namespace the claim belongs to.
	Namespace string
	// VolumeName is the claim's Spec.VolumeName.
	VolumeName string
	// Requests is the claim's storage request, taken from
	// Spec.Resources.Requests.Storage().Value().
	Requests float64
	// PodsClaimed is not set by getPvcsInfo.
	// NOTE(review): presumably the pods that mount this claim, filled in by the
	// caller — confirm against usage.
	PodsClaimed []string
}
  1040. func getPvcsInfo(pvcs []*clustercache.PersistentVolumeClaim) map[string]*PvcInfo {
  1041. toReturn := make(map[string]*PvcInfo)
  1042. for _, pvc := range pvcs {
  1043. ns := pvc.Namespace
  1044. pvcName := pvc.Name
  1045. volumeName := pvc.Spec.VolumeName
  1046. pvClass := getPersistentVolumeClaimClass(pvc)
  1047. requests := float64(pvc.Spec.Resources.Requests.Storage().Value())
  1048. key := ns + "," + pvcName
  1049. toReturn[key] = &PvcInfo{
  1050. Class: pvClass,
  1051. Claim: pvcName,
  1052. Namespace: ns,
  1053. VolumeName: volumeName,
  1054. Requests: requests,
  1055. }
  1056. }
  1057. return toReturn
  1058. }
// NodeGpuInfo contains the gpu count and vgpu counts for nodes
type NodeGpuInfo struct {
	// GPU is the GPU count computed for the node by gpuInfoFor.
	GPU float64
	// VGPU is the virtual GPU count computed for the node by gpuInfoFor; for a
	// standard NVIDIA GPU node it equals GPU.
	VGPU float64
}
  1064. func (ccs *ClusterCacheScraper) getNodesGpuInfo() map[string]*NodeGpuInfo {
  1065. // use a closure to cache allocatableVGPU result instead of calculating
  1066. // it every time we need it
  1067. var allocatableVGPUs *float64
  1068. allocVGPUs := func() (float64, error) {
  1069. if allocatableVGPUs != nil {
  1070. return *allocatableVGPUs, nil
  1071. }
  1072. vgpu, err := getAllocatableVGPUs(ccs.clusterCache.GetAllDaemonSets())
  1073. if err != nil {
  1074. return vgpu, err
  1075. }
  1076. allocatableVGPUs = &vgpu
  1077. return *allocatableVGPUs, nil
  1078. }
  1079. var nodeGpuMap map[string]*NodeGpuInfo = make(map[string]*NodeGpuInfo)
  1080. for _, node := range ccs.clusterCache.GetAllNodes() {
  1081. info, err := gpuInfoFor(node, allocVGPUs)
  1082. if err != nil {
  1083. log.Warnf("Failed to retrieve GPU Info for Node: %s - %s", node.Name, err)
  1084. continue
  1085. }
  1086. nodeGpuMap[node.Name] = info
  1087. }
  1088. return nodeGpuMap
  1089. }
  1090. // getPersistentVolumeClaimClass returns StorageClassName. If no storage class was
  1091. // requested, it returns "".
  1092. func getPersistentVolumeClaimClass(claim *clustercache.PersistentVolumeClaim) string {
  1093. // Use beta annotation first
  1094. if class, found := claim.Annotations[v1.BetaStorageClassAnnotation]; found {
  1095. return class
  1096. }
  1097. if claim.Spec.StorageClassName != nil {
  1098. return *claim.Spec.StorageClassName
  1099. }
  1100. // Special non-empty string to indicate absence of storage class.
  1101. return ""
  1102. }
  1103. // toResourceUnitValue accepts a resource name and quantity and returns the sanitized resource, the unit, and the value in the units.
  1104. // Returns an empty string for resource and unit if there was a failure.
  1105. func toResourceUnitValue(resourceName v1.ResourceName, quantity resource.Quantity) (resource string, unit string, value float64) {
  1106. resource = promutil.SanitizeLabelName(string(resourceName))
  1107. switch resourceName {
  1108. case v1.ResourceCPU:
  1109. unit = "core"
  1110. value = float64(quantity.MilliValue()) / 1000
  1111. return
  1112. case v1.ResourceStorage:
  1113. fallthrough
  1114. case v1.ResourceEphemeralStorage:
  1115. fallthrough
  1116. case v1.ResourceMemory:
  1117. unit = "byte"
  1118. value = float64(quantity.Value())
  1119. return
  1120. case v1.ResourcePods:
  1121. unit = "integer"
  1122. value = float64(quantity.Value())
  1123. return
  1124. default:
  1125. if isHugePageResourceName(resourceName) || isAttachableVolumeResourceName(resourceName) {
  1126. unit = "byte"
  1127. value = float64(quantity.Value())
  1128. return
  1129. }
  1130. if isExtendedResourceName(resourceName) {
  1131. unit = "integer"
  1132. value = float64(quantity.Value())
  1133. return
  1134. }
  1135. }
  1136. resource = ""
  1137. unit = ""
  1138. value = 0.0
  1139. return
  1140. }
  1141. func isGpuResourceName(name v1.ResourceName) bool {
  1142. return name == "nvidia.com/gpu" || name == "k8s.amazonaws.com/vgpu"
  1143. }
  1144. // isHugePageResourceName checks for a huge page container resource name
  1145. func isHugePageResourceName(name v1.ResourceName) bool {
  1146. return strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix)
  1147. }
  1148. // isAttachableVolumeResourceName checks for attached volume container resource name
  1149. func isAttachableVolumeResourceName(name v1.ResourceName) bool {
  1150. return strings.HasPrefix(string(name), v1.ResourceAttachableVolumesPrefix)
  1151. }
  1152. // isExtendedResourceName checks for extended container resource name
  1153. func isExtendedResourceName(name v1.ResourceName) bool {
  1154. if isNativeResource(name) || strings.HasPrefix(string(name), v1.DefaultResourceRequestsPrefix) {
  1155. return false
  1156. }
  1157. // Ensure it satisfies the rules in IsQualifiedName() after converted into quota resource name
  1158. nameForQuota := fmt.Sprintf("%s%s", v1.DefaultResourceRequestsPrefix, string(name))
  1159. if errs := validation.IsQualifiedName(nameForQuota); len(errs) != 0 {
  1160. return false
  1161. }
  1162. return true
  1163. }
  1164. // isNativeResource checks for a kubernetes.io/ prefixed resource name
  1165. func isNativeResource(name v1.ResourceName) bool {
  1166. return !strings.Contains(string(name), "/") || isPrefixedNativeResource(name)
  1167. }
  1168. func isPrefixedNativeResource(name v1.ResourceName) bool {
  1169. return strings.Contains(string(name), v1.ResourceDefaultNamespacePrefix)
  1170. }
  1171. // gets the Node GPUs and VGPUs using the node data from k8s. Returns nil if GPUs could not be located for the node.
  1172. func gpuInfoFor(
  1173. n *clustercache.Node,
  1174. allocatedVGPUs func() (float64, error),
  1175. ) (*NodeGpuInfo, error) {
  1176. g, hasGpu := n.Status.Capacity["nvidia.com/gpu"]
  1177. _, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
  1178. // Case 1: Standard NVIDIA GPU
  1179. if hasGpu && g.Value() != 0 && !hasReplicas {
  1180. return &NodeGpuInfo{
  1181. GPU: float64(g.Value()),
  1182. VGPU: float64(g.Value()),
  1183. }, nil
  1184. }
  1185. // Case 2: NVIDIA GPU with GPU Feature Discovery (GFD) Pod enabled.
  1186. // Ref: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html#verifying-the-gpu-time-slicing-configuration
  1187. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L44-L45
  1188. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L103-L118
  1189. if hasReplicas {
  1190. resultGPU := 0.0
  1191. resultVGPU := 0.0
  1192. if c, ok := n.Labels["nvidia.com/gpu.count"]; ok {
  1193. var err error
  1194. resultGPU, err = strconv.ParseFloat(c, 64)
  1195. if err != nil {
  1196. return nil, fmt.Errorf("could not parse label \"nvidia.com/gpu.count\": %v", err)
  1197. }
  1198. }
  1199. if s, ok := n.Status.Capacity["nvidia.com/gpu.shared"]; ok { // GFD configured `renameByDefault=true`
  1200. resultVGPU = float64(s.Value())
  1201. } else if g, ok := n.Status.Capacity["nvidia.com/gpu"]; ok { // GFD configured `renameByDefault=false`
  1202. resultVGPU = float64(g.Value())
  1203. } else {
  1204. resultVGPU = resultGPU
  1205. }
  1206. return &NodeGpuInfo{
  1207. GPU: resultGPU,
  1208. VGPU: resultVGPU,
  1209. }, nil
  1210. }
  1211. // Case 3: AWS vGPU
  1212. if vgpu, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
  1213. vgpuCount, err := allocatedVGPUs()
  1214. if err != nil {
  1215. return nil, err
  1216. }
  1217. vgpuCoeff := 10.0
  1218. if vgpuCount > 0.0 {
  1219. vgpuCoeff = vgpuCount
  1220. }
  1221. if vgpu.Value() != 0 {
  1222. resultGPU := float64(vgpu.Value()) / vgpuCoeff
  1223. resultVGPU := float64(vgpu.Value())
  1224. return &NodeGpuInfo{
  1225. GPU: resultGPU,
  1226. VGPU: resultVGPU,
  1227. }, nil
  1228. }
  1229. }
  1230. // No GPU found
  1231. return nil, nil
  1232. }
  1233. func getAllocatableVGPUs(daemonsets []*clustercache.DaemonSet) (float64, error) {
  1234. vgpuCount := 0.0
  1235. for _, ds := range daemonsets {
  1236. dsContainerList := &ds.SpecContainers
  1237. for _, ctnr := range *dsContainerList {
  1238. if ctnr.Args != nil {
  1239. for _, arg := range ctnr.Args {
  1240. if strings.Contains(arg, "--vgpu=") {
  1241. vgpus, err := strconv.ParseFloat(arg[strings.IndexByte(arg, '=')+1:], 64)
  1242. if err != nil {
  1243. log.Errorf("failed to parse vgpu allocation string %s: %v", arg, err)
  1244. continue
  1245. }
  1246. vgpuCount = vgpus
  1247. return vgpuCount, nil
  1248. }
  1249. }
  1250. }
  1251. }
  1252. }
  1253. return vgpuCount, nil
  1254. }