clustercache.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472
  1. package scrape
  2. import (
  3. "fmt"
  4. "slices"
  5. "strconv"
  6. "strings"
  7. "github.com/kubecost/events"
  8. "github.com/opencost/opencost/core/pkg/clustercache"
  9. "github.com/opencost/opencost/core/pkg/log"
  10. "github.com/opencost/opencost/core/pkg/source"
  11. coreutil "github.com/opencost/opencost/core/pkg/util"
  12. "github.com/opencost/opencost/core/pkg/util/promutil"
  13. "github.com/opencost/opencost/modules/collector-source/pkg/event"
  14. "github.com/opencost/opencost/modules/collector-source/pkg/metric"
  15. "github.com/opencost/opencost/modules/collector-source/pkg/util"
  16. "golang.org/x/exp/maps"
  17. v1 "k8s.io/api/core/v1"
  18. "k8s.io/apimachinery/pkg/api/resource"
  19. "k8s.io/apimachinery/pkg/types"
  20. "k8s.io/apimachinery/pkg/util/validation"
  21. )
  22. const unmountedPVsContainer = "unmounted-pvs"
  23. type ClusterCacheScraper struct {
  24. clusterCache clustercache.ClusterCache
  25. }
  26. func newClusterCacheScraper(clusterCache clustercache.ClusterCache) Scraper {
  27. return &ClusterCacheScraper{
  28. clusterCache: clusterCache,
  29. }
  30. }
  31. func (ccs *ClusterCacheScraper) Scrape() []metric.Update {
  32. // retrieve objects for scrape
  33. nodes := ccs.clusterCache.GetAllNodes()
  34. deployments := ccs.clusterCache.GetAllDeployments()
  35. namespaces := ccs.clusterCache.GetAllNamespaces()
  36. pods := ccs.clusterCache.GetAllPods()
  37. pvcs := ccs.clusterCache.GetAllPersistentVolumeClaims()
  38. pvs := ccs.clusterCache.GetAllPersistentVolumes()
  39. services := ccs.clusterCache.GetAllServices()
  40. statefulSets := ccs.clusterCache.GetAllStatefulSets()
  41. daemonSets := ccs.clusterCache.GetAllDaemonSets()
  42. jobs := ccs.clusterCache.GetAllJobs()
  43. cronJobs := ccs.clusterCache.GetAllCronJobs()
  44. replicaSets := ccs.clusterCache.GetAllReplicaSets()
  45. resourceQuotas := ccs.clusterCache.GetAllResourceQuotas()
  46. // create scrape indexes. While the pairs being mapped here don't have a 1 to 1 relationship in the general case,
  47. // we are assuming that in the context of a single snapshot of the cluster they are 1 to 1.
  48. nodeNameToUID := buildNodeIndex(nodes)
  49. namespaceNameToUID := buildNamespaceIndex(namespaces)
  50. pvcNameToUID := buildPVCIndex(pvcs)
  51. pvNameToUID := buildPVIndex(pvs)
  52. scrapeFuncs := []ScrapeFunc{
  53. ccs.GetScrapeNodes(nodes),
  54. ccs.GetScrapeDeployments(deployments, namespaceNameToUID),
  55. ccs.GetScrapeNamespaces(namespaces),
  56. ccs.GetScrapePods(pods, pvcs, nodeNameToUID, namespaceNameToUID, pvcNameToUID),
  57. ccs.GetScrapePVCs(pvcs, namespaceNameToUID, pvNameToUID),
  58. ccs.GetScrapePVs(pvs),
  59. ccs.GetScrapeServices(services, namespaceNameToUID),
  60. ccs.GetScrapeStatefulSets(statefulSets, namespaceNameToUID),
  61. ccs.GetScrapeDaemonSets(daemonSets, namespaceNameToUID),
  62. ccs.GetScrapeJobs(jobs, namespaceNameToUID),
  63. ccs.GetScrapeCronJobs(cronJobs, namespaceNameToUID),
  64. ccs.GetScrapeReplicaSets(replicaSets, namespaceNameToUID),
  65. ccs.GetScrapeResourceQuotas(resourceQuotas, namespaceNameToUID),
  66. }
  67. return concurrentScrape(scrapeFuncs...)
  68. }
  69. func (ccs *ClusterCacheScraper) GetScrapeNodes(nodes []*clustercache.Node) ScrapeFunc {
  70. return func() []metric.Update {
  71. return ccs.scrapeNodes(nodes)
  72. }
  73. }
  74. func (ccs *ClusterCacheScraper) scrapeNodes(nodes []*clustercache.Node) []metric.Update {
  75. var scrapeResults []metric.Update
  76. for _, node := range nodes {
  77. nodeInfo := map[string]string{
  78. source.NodeLabel: node.Name,
  79. source.ProviderIDLabel: node.SpecProviderID,
  80. source.UIDLabel: string(node.UID),
  81. }
  82. if instanceType, ok := coreutil.GetInstanceType(node.Labels); ok {
  83. nodeInfo[source.InstanceTypeLabel] = instanceType
  84. }
  85. scrapeResults = append(scrapeResults, metric.Update{
  86. Name: metric.NodeInfo,
  87. Labels: nodeInfo,
  88. AdditionalInfo: nodeInfo,
  89. })
  90. // Node Capacity
  91. scrapeResults = scrapeResourceList(
  92. metric.NodeResourceCapacities,
  93. node.Status.Capacity,
  94. nodeInfo,
  95. scrapeResults)
  96. // This block and metric can be removed, when we stop exporting assets and allocations
  97. if node.Status.Capacity != nil {
  98. if quantity, ok := node.Status.Capacity[v1.ResourceCPU]; ok {
  99. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  100. scrapeResults = append(scrapeResults, metric.Update{
  101. Name: metric.KubeNodeStatusCapacityCPUCores,
  102. Labels: nodeInfo,
  103. Value: value,
  104. })
  105. }
  106. if quantity, ok := node.Status.Capacity[v1.ResourceMemory]; ok {
  107. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  108. scrapeResults = append(scrapeResults, metric.Update{
  109. Name: metric.KubeNodeStatusCapacityMemoryBytes,
  110. Labels: nodeInfo,
  111. Value: value,
  112. })
  113. }
  114. }
  115. // Node Allocatable Resources
  116. scrapeResults = scrapeResourceList(
  117. metric.NodeResourcesAllocatable,
  118. node.Status.Allocatable,
  119. nodeInfo,
  120. scrapeResults)
  121. // This block and metric can be removed, when we stop exporting assets and allocations
  122. if node.Status.Allocatable != nil {
  123. if quantity, ok := node.Status.Allocatable[v1.ResourceCPU]; ok {
  124. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  125. scrapeResults = append(scrapeResults, metric.Update{
  126. Name: metric.KubeNodeStatusAllocatableCPUCores,
  127. Labels: nodeInfo,
  128. Value: value,
  129. })
  130. }
  131. if quantity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok {
  132. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  133. scrapeResults = append(scrapeResults, metric.Update{
  134. Name: metric.KubeNodeStatusAllocatableMemoryBytes,
  135. Labels: nodeInfo,
  136. Value: value,
  137. })
  138. }
  139. }
  140. // node labels
  141. labelNames, labelValues := promutil.KubeLabelsToLabels(node.Labels)
  142. nodeLabels := util.ToMap(labelNames, labelValues)
  143. scrapeResults = append(scrapeResults, metric.Update{
  144. Name: metric.KubeNodeLabels,
  145. Labels: nodeInfo,
  146. Value: 0,
  147. AdditionalInfo: nodeLabels,
  148. })
  149. }
  150. events.Dispatch(event.ScrapeEvent{
  151. ScraperName: event.KubernetesClusterScraperName,
  152. ScrapeType: event.NodeScraperType,
  153. Targets: len(nodes),
  154. Errors: nil,
  155. })
  156. return scrapeResults
  157. }
  158. func (ccs *ClusterCacheScraper) GetScrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) ScrapeFunc {
  159. return func() []metric.Update {
  160. return ccs.scrapeDeployments(deployments, namespaceIndex)
  161. }
  162. }
  163. func (ccs *ClusterCacheScraper) scrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) []metric.Update {
  164. var scrapeResults []metric.Update
  165. for _, deployment := range deployments {
  166. nsUID, ok := namespaceIndex[deployment.Namespace]
  167. if !ok {
  168. log.Debugf("deployment namespaceUID missing from index for namespace name '%s'", deployment.Namespace)
  169. }
  170. deploymentInfo := map[string]string{
  171. source.UIDLabel: string(deployment.UID),
  172. source.NamespaceUIDLabel: string(nsUID),
  173. source.NamespaceLabel: deployment.Namespace,
  174. source.DeploymentLabel: deployment.Name,
  175. }
  176. scrapeResults = append(scrapeResults, metric.Update{
  177. Name: metric.DeploymentInfo,
  178. Labels: deploymentInfo,
  179. Value: 0,
  180. AdditionalInfo: deploymentInfo,
  181. })
  182. // deployment labels
  183. labelNames, labelValues := promutil.KubeLabelsToLabels(deployment.Labels)
  184. deploymentLabels := util.ToMap(labelNames, labelValues)
  185. scrapeResults = append(scrapeResults, metric.Update{
  186. Name: metric.DeploymentLabels,
  187. Labels: deploymentInfo,
  188. Value: 0,
  189. AdditionalInfo: deploymentLabels,
  190. })
  191. // deployment annotations
  192. annoationNames, annotationValues := promutil.KubeAnnotationsToLabels(deployment.Annotations)
  193. deploymentAnnotations := util.ToMap(annoationNames, annotationValues)
  194. scrapeResults = append(scrapeResults, metric.Update{
  195. Name: metric.DeploymentAnnotations,
  196. Labels: deploymentInfo,
  197. Value: 0,
  198. AdditionalInfo: deploymentAnnotations,
  199. })
  200. // deployment match labels
  201. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(deployment.MatchLabels)
  202. deploymentMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  203. scrapeResults = append(scrapeResults, metric.Update{
  204. Name: metric.DeploymentMatchLabels,
  205. Labels: deploymentInfo,
  206. Value: 0,
  207. AdditionalInfo: deploymentMatchLabels,
  208. })
  209. }
  210. events.Dispatch(event.ScrapeEvent{
  211. ScraperName: event.KubernetesClusterScraperName,
  212. ScrapeType: event.DeploymentScraperType,
  213. Targets: len(deployments),
  214. Errors: nil,
  215. })
  216. return scrapeResults
  217. }
  218. func (ccs *ClusterCacheScraper) GetScrapeNamespaces(namespaces []*clustercache.Namespace) ScrapeFunc {
  219. return func() []metric.Update {
  220. return ccs.scrapeNamespaces(namespaces)
  221. }
  222. }
  223. func (ccs *ClusterCacheScraper) scrapeNamespaces(namespaces []*clustercache.Namespace) []metric.Update {
  224. var scrapeResults []metric.Update
  225. for _, namespace := range namespaces {
  226. namespaceInfo := map[string]string{
  227. source.NamespaceLabel: namespace.Name,
  228. source.UIDLabel: string(namespace.UID),
  229. }
  230. scrapeResults = append(scrapeResults, metric.Update{
  231. Name: metric.NamespaceInfo,
  232. Labels: namespaceInfo,
  233. AdditionalInfo: namespaceInfo,
  234. Value: 0,
  235. })
  236. // namespace labels
  237. labelNames, labelValues := promutil.KubeLabelsToLabels(namespace.Labels)
  238. namespaceLabels := util.ToMap(labelNames, labelValues)
  239. scrapeResults = append(scrapeResults, metric.Update{
  240. Name: metric.KubeNamespaceLabels,
  241. Labels: namespaceInfo,
  242. Value: 0,
  243. AdditionalInfo: namespaceLabels,
  244. })
  245. // namespace annotations
  246. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(namespace.Annotations)
  247. namespaceAnnotations := util.ToMap(annotationNames, annotationValues)
  248. scrapeResults = append(scrapeResults, metric.Update{
  249. Name: metric.KubeNamespaceAnnotations,
  250. Labels: namespaceInfo,
  251. Value: 0,
  252. AdditionalInfo: namespaceAnnotations,
  253. })
  254. }
  255. events.Dispatch(event.ScrapeEvent{
  256. ScraperName: event.KubernetesClusterScraperName,
  257. ScrapeType: event.NamespaceScraperType,
  258. Targets: len(namespaces),
  259. Errors: nil,
  260. })
  261. return scrapeResults
  262. }
  263. func (ccs *ClusterCacheScraper) GetScrapePods(
  264. pods []*clustercache.Pod,
  265. pvcs []*clustercache.PersistentVolumeClaim,
  266. nodeIndex map[string]types.UID,
  267. namespaceIndex map[string]types.UID,
  268. pvcIndex map[pvcKey]types.UID,
  269. ) ScrapeFunc {
  270. return func() []metric.Update {
  271. return ccs.scrapePods(pods, pvcs, nodeIndex, namespaceIndex, pvcIndex)
  272. }
  273. }
  274. func (ccs *ClusterCacheScraper) scrapePods(
  275. pods []*clustercache.Pod,
  276. pvcs []*clustercache.PersistentVolumeClaim,
  277. nodeIndex map[string]types.UID,
  278. namespaceIndex map[string]types.UID,
  279. pvcIndex map[pvcKey]types.UID,
  280. ) []metric.Update {
  281. // this is only populated if we find gpu resources being requested
  282. var nodesGpuInfo map[string]*NodeGpuInfo
  283. // pv allocation and unmounted pvs
  284. pvcInfo := getPvcsInfo(pvcs)
  285. // pod info by uid
  286. podInfoByUid := make(map[string]map[string]string)
  287. var scrapeResults []metric.Update
  288. for _, pod := range pods {
  289. // pods without a set node name are not running
  290. if pod.Spec.NodeName == "" {
  291. continue
  292. }
  293. nodeUID, ok := nodeIndex[pod.Spec.NodeName]
  294. if !ok {
  295. log.Debugf("pod nodeUID missing from index for node name '%s'", pod.Spec.NodeName)
  296. }
  297. nsUID, ok := namespaceIndex[pod.Namespace]
  298. if !ok {
  299. log.Debugf("pod namespaceUID missing from index for namespace name '%s'", pod.Namespace)
  300. }
  301. podInfo := map[string]string{
  302. source.UIDLabel: string(pod.UID),
  303. source.PodLabel: pod.Name,
  304. source.NamespaceUIDLabel: string(nsUID),
  305. source.NodeUIDLabel: string(nodeUID),
  306. }
  307. scrapeResults = append(scrapeResults, metric.Update{
  308. Name: metric.PodInfo,
  309. Labels: podInfo,
  310. Value: 0,
  311. AdditionalInfo: podInfo,
  312. })
  313. podInfo[source.NamespaceLabel] = pod.Namespace
  314. podInfo[source.NodeLabel] = pod.Spec.NodeName
  315. podInfo[source.InstanceLabel] = pod.Spec.NodeName
  316. podInfoByUid[string(pod.UID)] = podInfo
  317. // pod labels
  318. labelNames, labelValues := promutil.KubeLabelsToLabels(pod.Labels)
  319. podLabels := util.ToMap(labelNames, labelValues)
  320. scrapeResults = append(scrapeResults, metric.Update{
  321. Name: metric.KubePodLabels,
  322. Labels: podInfo,
  323. Value: 0,
  324. AdditionalInfo: podLabels,
  325. })
  326. // pod annotations
  327. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(pod.Annotations)
  328. podAnnotations := util.ToMap(annotationNames, annotationValues)
  329. scrapeResults = append(scrapeResults, metric.Update{
  330. Name: metric.KubePodAnnotations,
  331. Labels: podInfo,
  332. Value: 0,
  333. AdditionalInfo: podAnnotations,
  334. })
  335. // Determine PVC use data for Pod
  336. claimed := make(map[string]struct{})
  337. for _, volume := range pod.Spec.Volumes {
  338. if volume.PersistentVolumeClaim != nil {
  339. name := volume.PersistentVolumeClaim.ClaimName
  340. key := pod.Namespace + "," + name
  341. if _, seen := claimed[key]; seen {
  342. continue
  343. }
  344. if pvc, ok := pvcInfo[key]; ok {
  345. pvc.PodsClaimed = append(pvc.PodsClaimed, string(pod.UID))
  346. claimed[key] = struct{}{}
  347. }
  348. }
  349. }
  350. // Pod owner metric
  351. for _, owner := range pod.OwnerReferences {
  352. controller := "false"
  353. if owner.Controller != nil && *owner.Controller {
  354. controller = "true"
  355. }
  356. ownerInfo := maps.Clone(podInfo)
  357. ownerInfo[source.OwnerKindLabel] = owner.Kind
  358. ownerInfo[source.OwnerNameLabel] = owner.Name
  359. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  360. ownerInfo[source.ContainerLabel] = controller
  361. scrapeResults = append(scrapeResults, metric.Update{
  362. Name: metric.KubePodOwner,
  363. Labels: ownerInfo,
  364. Value: 0,
  365. })
  366. }
  367. // Container Status
  368. for _, status := range pod.Status.ContainerStatuses {
  369. if status.State.Running != nil {
  370. containerInfo := maps.Clone(podInfo)
  371. containerInfo[source.ContainerLabel] = status.Name
  372. scrapeResults = append(scrapeResults, metric.Update{
  373. Name: metric.KubePodContainerStatusRunning,
  374. Labels: containerInfo,
  375. AdditionalInfo: containerInfo,
  376. Value: 0,
  377. })
  378. }
  379. }
  380. for _, volume := range pod.Spec.Volumes {
  381. if volume.PersistentVolumeClaim != nil {
  382. pvcUID, ok := pvcIndex[pvcKey{
  383. name: volume.PersistentVolumeClaim.ClaimName,
  384. namespace: pod.Namespace,
  385. }]
  386. if !ok {
  387. continue
  388. }
  389. podPVCVolumeInfo := map[string]string{
  390. source.UIDLabel: string(pod.UID),
  391. source.PVCUIDLabel: string(pvcUID),
  392. source.PodVolumeNameLabel: volume.Name,
  393. }
  394. scrapeResults = append(scrapeResults, metric.Update{
  395. Name: metric.PodPVCVolume,
  396. Labels: podPVCVolumeInfo,
  397. Value: 0,
  398. })
  399. }
  400. }
  401. for _, container := range pod.Spec.Containers {
  402. containerInfo := maps.Clone(podInfo)
  403. containerInfo[source.ContainerLabel] = container.Name
  404. // Requests
  405. scrapeResults = scrapeResourceList(
  406. metric.KubePodContainerResourceRequests,
  407. container.Resources.Requests,
  408. containerInfo,
  409. scrapeResults)
  410. // Limits
  411. scrapeResults = scrapeResourceList(
  412. metric.KubePodContainerResourceLimits,
  413. container.Resources.Limits,
  414. containerInfo,
  415. scrapeResults)
  416. // Todo remove when asset/allocation pipeline are removed
  417. // gpu "requests" is either the request or limit if it exists
  418. var gpuRequest *float64
  419. for resourceName, quantity := range container.Resources.Requests {
  420. if isGpuResourceName(resourceName) {
  421. // set gpu request if it exists
  422. _, _, value := toResourceUnitValue(resourceName, quantity)
  423. gpuRequestValue := value
  424. gpuRequest = &gpuRequestValue
  425. break
  426. }
  427. }
  428. // Limits
  429. if gpuRequest == nil {
  430. for resourceName, quantity := range container.Resources.Limits {
  431. if isGpuResourceName(resourceName) {
  432. // set gpu request if it exists
  433. _, _, value := toResourceUnitValue(resourceName, quantity)
  434. gpuRequestValue := value
  435. gpuRequest = &gpuRequestValue
  436. break
  437. }
  438. }
  439. }
  440. // handle the GPU allocation metric here IFF there exists a request/limit for GPUs
  441. // we only load the node gpu data map if we run into a container with gpu requests/limits
  442. if gpuRequest != nil {
  443. if nodesGpuInfo == nil {
  444. nodesGpuInfo = ccs.getNodesGpuInfo()
  445. }
  446. gpuAlloc := *gpuRequest
  447. if nodeGpuInfo, ok := nodesGpuInfo[pod.Spec.NodeName]; ok {
  448. if nodeGpuInfo != nil && nodeGpuInfo.VGPU != 0 {
  449. gpuAlloc = gpuAlloc * (nodeGpuInfo.GPU / nodeGpuInfo.VGPU)
  450. }
  451. }
  452. scrapeResults = append(scrapeResults, metric.Update{
  453. Name: metric.ContainerGPUAllocation,
  454. Labels: maps.Clone(containerInfo),
  455. Value: gpuAlloc,
  456. })
  457. }
  458. }
  459. }
  460. // Iterate through PVC Info after the pods have been tallied and export
  461. // allocation metrics based on the number of other pods claiming the volume
  462. for _, pvc := range pvcInfo {
  463. // unmounted pvs get full allocation
  464. if len(pvc.PodsClaimed) == 0 {
  465. labels := map[string]string{
  466. source.PodLabel: unmountedPVsContainer,
  467. source.NamespaceLabel: pvc.Namespace,
  468. source.PVCLabel: pvc.Claim,
  469. source.PVLabel: pvc.VolumeName,
  470. }
  471. scrapeResults = append(scrapeResults, metric.Update{
  472. Name: metric.PodPVCAllocation,
  473. Labels: labels,
  474. Value: pvc.Requests,
  475. })
  476. continue
  477. }
  478. // pods get a proportion of pv allocation
  479. value := pvc.Requests / float64(len(pvc.PodsClaimed))
  480. for _, podUid := range pvc.PodsClaimed {
  481. podInfo, ok := podInfoByUid[podUid]
  482. if !ok {
  483. continue
  484. }
  485. pvcLabels := maps.Clone(podInfo)
  486. pvcLabels[source.PVCLabel] = pvc.Claim
  487. pvcLabels[source.PVLabel] = pvc.VolumeName
  488. scrapeResults = append(scrapeResults, metric.Update{
  489. Name: metric.PodPVCAllocation,
  490. Labels: pvcLabels,
  491. Value: value,
  492. })
  493. }
  494. }
  495. events.Dispatch(event.ScrapeEvent{
  496. ScraperName: event.KubernetesClusterScraperName,
  497. ScrapeType: event.PodScraperType,
  498. Targets: len(pods),
  499. Errors: nil,
  500. })
  501. return scrapeResults
  502. }
  503. func scrapeResourceList(metricName string, resourceList v1.ResourceList, baseLabels map[string]string, scrapeResults []metric.Update) []metric.Update {
  504. if resourceList != nil {
  505. // sorting keys here for testing purposes
  506. keys := maps.Keys(resourceList)
  507. slices.Sort(keys)
  508. for _, resourceName := range keys {
  509. quantity := resourceList[resourceName]
  510. resource, unit, value := toResourceUnitValue(resourceName, quantity)
  511. // failed to parse the resource type
  512. if resource == "" {
  513. log.DedupedWarningf(5, "Failed to parse resource units and quantity for resource: %s", resourceName)
  514. continue
  515. }
  516. resourceRequestInfo := maps.Clone(baseLabels)
  517. resourceRequestInfo[source.ResourceLabel] = resource
  518. resourceRequestInfo[source.UnitLabel] = unit
  519. scrapeResults = append(scrapeResults, metric.Update{
  520. Name: metricName,
  521. Labels: resourceRequestInfo,
  522. Value: value,
  523. })
  524. }
  525. }
  526. return scrapeResults
  527. }
  528. func (ccs *ClusterCacheScraper) GetScrapePVCs(
  529. pvcs []*clustercache.PersistentVolumeClaim,
  530. namespaceIndex map[string]types.UID,
  531. pvIndex map[string]types.UID,
  532. ) ScrapeFunc {
  533. return func() []metric.Update {
  534. return ccs.scrapePVCs(pvcs, namespaceIndex, pvIndex)
  535. }
  536. }
  537. func (ccs *ClusterCacheScraper) scrapePVCs(
  538. pvcs []*clustercache.PersistentVolumeClaim,
  539. namespaceIndex map[string]types.UID,
  540. pvIndex map[string]types.UID,
  541. ) []metric.Update {
  542. var scrapeResults []metric.Update
  543. for _, pvc := range pvcs {
  544. nsUID, ok := namespaceIndex[pvc.Namespace]
  545. if !ok {
  546. log.Debugf("pvc namespaceUID missing from index for namespace name '%s'", pvc.Namespace)
  547. }
  548. pvUID, ok := pvIndex[pvc.Spec.VolumeName]
  549. if !ok && pvc.Spec.VolumeName != "" {
  550. log.Debugf("pvc volume name missing from index for pv name '%s'", pvc.Spec.VolumeName)
  551. }
  552. pvcInfo := map[string]string{
  553. source.UIDLabel: string(pvc.UID),
  554. source.PVCLabel: pvc.Name,
  555. source.NamespaceUIDLabel: string(nsUID),
  556. source.NamespaceLabel: pvc.Namespace,
  557. source.VolumeNameLabel: pvc.Spec.VolumeName,
  558. source.PVUIDLabel: string(pvUID),
  559. source.StorageClassLabel: getPersistentVolumeClaimClass(pvc),
  560. }
  561. scrapeResults = append(scrapeResults, metric.Update{
  562. Name: metric.KubePersistentVolumeClaimInfo,
  563. Labels: pvcInfo,
  564. AdditionalInfo: pvcInfo,
  565. Value: 0,
  566. })
  567. if storage, ok := pvc.Spec.Resources.Requests[v1.ResourceStorage]; ok {
  568. scrapeResults = append(scrapeResults, metric.Update{
  569. Name: metric.KubePersistentVolumeClaimResourceRequestsStorageBytes,
  570. Labels: pvcInfo,
  571. Value: float64(storage.Value()),
  572. })
  573. }
  574. }
  575. events.Dispatch(event.ScrapeEvent{
  576. ScraperName: event.KubernetesClusterScraperName,
  577. ScrapeType: event.PvcScraperType,
  578. Targets: len(pvcs),
  579. Errors: nil,
  580. })
  581. return scrapeResults
  582. }
  583. func (ccs *ClusterCacheScraper) GetScrapePVs(pvs []*clustercache.PersistentVolume) ScrapeFunc {
  584. return func() []metric.Update {
  585. return ccs.scrapePVs(pvs)
  586. }
  587. }
  588. func (ccs *ClusterCacheScraper) scrapePVs(pvs []*clustercache.PersistentVolume) []metric.Update {
  589. var scrapeResults []metric.Update
  590. for _, pv := range pvs {
  591. providerID := pv.Name
  592. var csiVolumeHandle string
  593. // if a more accurate provider ID is available, use that
  594. if pv.Spec.CSI != nil && pv.Spec.CSI.VolumeHandle != "" {
  595. providerID = pv.Spec.CSI.VolumeHandle
  596. csiVolumeHandle = pv.Spec.CSI.VolumeHandle
  597. }
  598. pvInfo := map[string]string{
  599. source.UIDLabel: string(pv.UID),
  600. source.PVLabel: pv.Name,
  601. source.StorageClassLabel: pv.Spec.StorageClassName,
  602. source.ProviderIDLabel: providerID,
  603. source.CSIVolumeHandleLabel: csiVolumeHandle,
  604. }
  605. scrapeResults = append(scrapeResults, metric.Update{
  606. Name: metric.KubecostPVInfo,
  607. Labels: pvInfo,
  608. AdditionalInfo: pvInfo,
  609. Value: 0,
  610. })
  611. if storage, ok := pv.Spec.Capacity[v1.ResourceStorage]; ok {
  612. scrapeResults = append(scrapeResults, metric.Update{
  613. Name: metric.KubePersistentVolumeCapacityBytes,
  614. Labels: pvInfo,
  615. Value: float64(storage.Value()),
  616. })
  617. }
  618. }
  619. events.Dispatch(event.ScrapeEvent{
  620. ScraperName: event.KubernetesClusterScraperName,
  621. ScrapeType: event.PvScraperType,
  622. Targets: len(pvs),
  623. Errors: nil,
  624. })
  625. return scrapeResults
  626. }
  627. func (ccs *ClusterCacheScraper) GetScrapeServices(
  628. services []*clustercache.Service,
  629. namespaceIndex map[string]types.UID,
  630. ) ScrapeFunc {
  631. return func() []metric.Update {
  632. return ccs.scrapeServices(services, namespaceIndex)
  633. }
  634. }
  635. func (ccs *ClusterCacheScraper) scrapeServices(
  636. services []*clustercache.Service,
  637. namespaceIndex map[string]types.UID,
  638. ) []metric.Update {
  639. var scrapeResults []metric.Update
  640. for _, service := range services {
  641. namespaceUID := namespaceIndex[service.Namespace]
  642. // Assuming one address for now
  643. var lbIngressAddress string
  644. lbIngressAddresses := clustercache.GetLoadBalancerIngressAddress(service)
  645. if len(lbIngressAddresses) > 0 {
  646. lbIngressAddress = lbIngressAddresses[0]
  647. }
  648. serviceInfo := map[string]string{
  649. source.UIDLabel: string(service.UID),
  650. source.ServiceLabel: service.Name,
  651. source.NamespaceLabel: service.Namespace,
  652. source.NamespaceUIDLabel: string(namespaceUID),
  653. source.ServiceTypeLabel: string(service.Type),
  654. source.LBIngressAddress: lbIngressAddress,
  655. }
  656. scrapeResults = append(scrapeResults, metric.Update{
  657. Name: metric.ServiceInfo,
  658. Labels: serviceInfo,
  659. Value: 0,
  660. AdditionalInfo: serviceInfo,
  661. })
  662. // service selector labels
  663. selectorNames, selectorValues := promutil.KubeLabelsToLabels(service.SpecSelector)
  664. serviceLabels := util.ToMap(selectorNames, selectorValues)
  665. scrapeResults = append(scrapeResults, metric.Update{
  666. Name: metric.ServiceSelectorLabels,
  667. Labels: serviceInfo,
  668. Value: 0,
  669. AdditionalInfo: serviceLabels,
  670. })
  671. }
  672. events.Dispatch(event.ScrapeEvent{
  673. ScraperName: event.KubernetesClusterScraperName,
  674. ScrapeType: event.ServiceScraperType,
  675. Targets: len(services),
  676. Errors: nil,
  677. })
  678. return scrapeResults
  679. }
  680. func (ccs *ClusterCacheScraper) GetScrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  681. return func() []metric.Update {
  682. return ccs.scrapeStatefulSets(statefulSets, namespaceIndex)
  683. }
  684. }
  685. func (ccs *ClusterCacheScraper) scrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) []metric.Update {
  686. var scrapeResults []metric.Update
  687. for _, statefulSet := range statefulSets {
  688. nsUID, ok := namespaceIndex[statefulSet.Namespace]
  689. if !ok {
  690. log.Debugf("statefulSet namespaceUID missing from index for namespace name '%s'", statefulSet.Namespace)
  691. }
  692. statefulSetInfo := map[string]string{
  693. source.UIDLabel: string(statefulSet.UID),
  694. source.NamespaceUIDLabel: string(nsUID),
  695. source.StatefulSetLabel: statefulSet.Name,
  696. }
  697. // statefulSet info
  698. scrapeResults = append(scrapeResults, metric.Update{
  699. Name: metric.StatefulSetInfo,
  700. Labels: statefulSetInfo,
  701. Value: 0,
  702. AdditionalInfo: statefulSetInfo,
  703. })
  704. // statefulSet labels
  705. labelNames, labelValues := promutil.KubeLabelsToLabels(statefulSet.Labels)
  706. statefulSetLabels := util.ToMap(labelNames, labelValues)
  707. scrapeResults = append(scrapeResults, metric.Update{
  708. Name: metric.StatefulSetLabels,
  709. Labels: statefulSetInfo,
  710. Value: 0,
  711. AdditionalInfo: statefulSetLabels,
  712. })
  713. // statefulSet annotations
  714. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(statefulSet.Annotations)
  715. statefulSetAnnotations := util.ToMap(annotationNames, annotationValues)
  716. scrapeResults = append(scrapeResults, metric.Update{
  717. Name: metric.StatefulSetAnnotations,
  718. Labels: statefulSetInfo,
  719. Value: 0,
  720. AdditionalInfo: statefulSetAnnotations,
  721. })
  722. // statefulSet match labels
  723. statefulSetInfo[source.NamespaceLabel] = statefulSet.Namespace
  724. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(statefulSet.SpecSelector.MatchLabels)
  725. statefulSetMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  726. scrapeResults = append(scrapeResults, metric.Update{
  727. Name: metric.StatefulSetMatchLabels,
  728. Labels: statefulSetInfo,
  729. Value: 0,
  730. AdditionalInfo: statefulSetMatchLabels,
  731. })
  732. }
  733. events.Dispatch(event.ScrapeEvent{
  734. ScraperName: event.KubernetesClusterScraperName,
  735. ScrapeType: event.StatefulSetScraperType,
  736. Targets: len(statefulSets),
  737. Errors: nil,
  738. })
  739. return scrapeResults
  740. }
  741. func (ccs *ClusterCacheScraper) GetScrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  742. return func() []metric.Update {
  743. return ccs.scrapeDaemonSets(daemonSets, namespaceIndex)
  744. }
  745. }
  746. func (ccs *ClusterCacheScraper) scrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) []metric.Update {
  747. var scrapeResults []metric.Update
  748. for _, daemonSet := range daemonSets {
  749. nsUID, ok := namespaceIndex[daemonSet.Namespace]
  750. if !ok {
  751. log.Debugf("daemonSet namespaceUID missing from index for namespace name '%s'", daemonSet.Namespace)
  752. }
  753. daemonSetInfo := map[string]string{
  754. source.UIDLabel: string(daemonSet.UID),
  755. source.NamespaceUIDLabel: string(nsUID),
  756. source.DaemonSetLabel: daemonSet.Name,
  757. }
  758. // daemonSet info
  759. scrapeResults = append(scrapeResults, metric.Update{
  760. Name: metric.DaemonSetInfo,
  761. Labels: daemonSetInfo,
  762. Value: 0,
  763. AdditionalInfo: daemonSetInfo,
  764. })
  765. // daemonSet labels
  766. labelNames, labelValues := promutil.KubeLabelsToLabels(daemonSet.Labels)
  767. daemonSetLabels := util.ToMap(labelNames, labelValues)
  768. scrapeResults = append(scrapeResults, metric.Update{
  769. Name: metric.DaemonSetLabels,
  770. Labels: daemonSetInfo,
  771. Value: 0,
  772. AdditionalInfo: daemonSetLabels,
  773. })
  774. // daemonSet annotations
  775. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(daemonSet.Annotations)
  776. daemonSetAnnotations := util.ToMap(annotationNames, annotationValues)
  777. scrapeResults = append(scrapeResults, metric.Update{
  778. Name: metric.DaemonSetAnnotations,
  779. Labels: daemonSetInfo,
  780. Value: 0,
  781. AdditionalInfo: daemonSetAnnotations,
  782. })
  783. }
  784. events.Dispatch(event.ScrapeEvent{
  785. ScraperName: event.KubernetesClusterScraperName,
  786. ScrapeType: event.DaemonSetScraperType,
  787. Targets: len(daemonSets),
  788. Errors: nil,
  789. })
  790. return scrapeResults
  791. }
  792. func (ccs *ClusterCacheScraper) GetScrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) ScrapeFunc {
  793. return func() []metric.Update {
  794. return ccs.scrapeJobs(jobs, namespaceIndex)
  795. }
  796. }
  797. func (ccs *ClusterCacheScraper) scrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) []metric.Update {
  798. var scrapeResults []metric.Update
  799. for _, job := range jobs {
  800. nsUID, ok := namespaceIndex[job.Namespace]
  801. if !ok {
  802. log.Debugf("job namespaceUID missing from index for namespace name '%s'", job.Namespace)
  803. }
  804. jobInfo := map[string]string{
  805. source.UIDLabel: string(job.UID),
  806. source.NamespaceUIDLabel: string(nsUID),
  807. source.JobLabel: job.Name,
  808. }
  809. // job info
  810. scrapeResults = append(scrapeResults, metric.Update{
  811. Name: metric.JobInfo,
  812. Labels: jobInfo,
  813. Value: 0,
  814. AdditionalInfo: jobInfo,
  815. })
  816. // job labels
  817. labelNames, labelValues := promutil.KubeLabelsToLabels(job.Labels)
  818. jobLabels := util.ToMap(labelNames, labelValues)
  819. scrapeResults = append(scrapeResults, metric.Update{
  820. Name: metric.JobLabels,
  821. Labels: jobInfo,
  822. Value: 0,
  823. AdditionalInfo: jobLabels,
  824. })
  825. // job annotations
  826. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(job.Annotations)
  827. jobAnnotations := util.ToMap(annotationNames, annotationValues)
  828. scrapeResults = append(scrapeResults, metric.Update{
  829. Name: metric.JobAnnotations,
  830. Labels: jobInfo,
  831. Value: 0,
  832. AdditionalInfo: jobAnnotations,
  833. })
  834. }
  835. events.Dispatch(event.ScrapeEvent{
  836. ScraperName: event.KubernetesClusterScraperName,
  837. ScrapeType: event.JobScraperType,
  838. Targets: len(jobs),
  839. Errors: nil,
  840. })
  841. return scrapeResults
  842. }
  843. func (ccs *ClusterCacheScraper) GetScrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) ScrapeFunc {
  844. return func() []metric.Update {
  845. return ccs.scrapeCronJobs(cronJobs, namespaceIndex)
  846. }
  847. }
  848. func (ccs *ClusterCacheScraper) scrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) []metric.Update {
  849. var scrapeResults []metric.Update
  850. for _, cronJob := range cronJobs {
  851. nsUID, ok := namespaceIndex[cronJob.Namespace]
  852. if !ok {
  853. log.Debugf("cronjob namespaceUID missing from index for namespace name '%s'", cronJob.Namespace)
  854. }
  855. cronJobInfo := map[string]string{
  856. source.UIDLabel: string(cronJob.UID),
  857. source.NamespaceUIDLabel: string(nsUID),
  858. source.CronJobLabel: cronJob.Name,
  859. }
  860. // cronjob info
  861. scrapeResults = append(scrapeResults, metric.Update{
  862. Name: metric.CronJobInfo,
  863. Labels: cronJobInfo,
  864. Value: 0,
  865. AdditionalInfo: cronJobInfo,
  866. })
  867. // cronjob labels
  868. labelNames, labelValues := promutil.KubeLabelsToLabels(cronJob.Labels)
  869. cronJobLabels := util.ToMap(labelNames, labelValues)
  870. scrapeResults = append(scrapeResults, metric.Update{
  871. Name: metric.CronJobLabels,
  872. Labels: cronJobInfo,
  873. Value: 0,
  874. AdditionalInfo: cronJobLabels,
  875. })
  876. // cronjob annotations
  877. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(cronJob.Annotations)
  878. cronJobAnnotations := util.ToMap(annotationNames, annotationValues)
  879. scrapeResults = append(scrapeResults, metric.Update{
  880. Name: metric.CronJobAnnotations,
  881. Labels: cronJobInfo,
  882. Value: 0,
  883. AdditionalInfo: cronJobAnnotations,
  884. })
  885. }
  886. events.Dispatch(event.ScrapeEvent{
  887. ScraperName: event.KubernetesClusterScraperName,
  888. ScrapeType: event.CronJobScraperType,
  889. Targets: len(cronJobs),
  890. Errors: nil,
  891. })
  892. return scrapeResults
  893. }
  894. func (ccs *ClusterCacheScraper) GetScrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  895. return func() []metric.Update {
  896. return ccs.scrapeReplicaSets(replicaSets, namespaceIndex)
  897. }
  898. }
  899. func (ccs *ClusterCacheScraper) scrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) []metric.Update {
  900. var scrapeResults []metric.Update
  901. for _, replicaSet := range replicaSets {
  902. nsUID, ok := namespaceIndex[replicaSet.Namespace]
  903. if !ok {
  904. log.Debugf("replicaset namespaceUID missing from index for namespace name '%s'", replicaSet.Namespace)
  905. }
  906. replicaSetInfo := map[string]string{
  907. source.UIDLabel: string(replicaSet.UID),
  908. source.NamespaceUIDLabel: string(nsUID),
  909. source.ReplicaSetLabel: replicaSet.Name,
  910. }
  911. // replicaset info
  912. scrapeResults = append(scrapeResults, metric.Update{
  913. Name: metric.ReplicaSetInfo,
  914. Labels: replicaSetInfo,
  915. Value: 0,
  916. AdditionalInfo: replicaSetInfo,
  917. })
  918. // replicaset labels
  919. labelNames, labelValues := promutil.KubeLabelsToLabels(replicaSet.Labels)
  920. replicaSetLabels := util.ToMap(labelNames, labelValues)
  921. scrapeResults = append(scrapeResults, metric.Update{
  922. Name: metric.ReplicaSetLabels,
  923. Labels: replicaSetInfo,
  924. Value: 0,
  925. AdditionalInfo: replicaSetLabels,
  926. })
  927. // replicaset annotations
  928. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(replicaSet.Annotations)
  929. replicaSetAnnotations := util.ToMap(annotationNames, annotationValues)
  930. scrapeResults = append(scrapeResults, metric.Update{
  931. Name: metric.ReplicaSetAnnotations,
  932. Labels: replicaSetInfo,
  933. Value: 0,
  934. AdditionalInfo: replicaSetAnnotations,
  935. })
  936. // owner references for backward compatibility
  937. replicaSetOwnerInfo := map[string]string{
  938. source.ReplicaSetLabel: replicaSet.Name,
  939. source.NamespaceLabel: replicaSet.Namespace,
  940. source.UIDLabel: string(replicaSet.UID),
  941. }
  942. // this specific metric exports a special <none> value for name and kind
  943. // if there are no owners
  944. if len(replicaSet.OwnerReferences) == 0 {
  945. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  946. ownerInfo[source.OwnerKindLabel] = source.NoneLabelValue
  947. ownerInfo[source.OwnerNameLabel] = source.NoneLabelValue
  948. scrapeResults = append(scrapeResults, metric.Update{
  949. Name: metric.KubeReplicasetOwner,
  950. Labels: ownerInfo,
  951. Value: 0,
  952. })
  953. } else {
  954. for _, owner := range replicaSet.OwnerReferences {
  955. controller := "false"
  956. if owner.Controller != nil && *owner.Controller {
  957. controller = "true"
  958. }
  959. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  960. ownerInfo[source.OwnerKindLabel] = owner.Kind
  961. ownerInfo[source.OwnerNameLabel] = owner.Name
  962. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  963. ownerInfo[source.ControllerLabel] = controller
  964. scrapeResults = append(scrapeResults, metric.Update{
  965. Name: metric.KubeReplicasetOwner,
  966. Labels: ownerInfo,
  967. Value: 0,
  968. })
  969. }
  970. }
  971. }
  972. events.Dispatch(event.ScrapeEvent{
  973. ScraperName: event.KubernetesClusterScraperName,
  974. ScrapeType: event.ReplicaSetScraperType,
  975. Targets: len(replicaSets),
  976. Errors: nil,
  977. })
  978. return scrapeResults
  979. }
  980. func (ccs *ClusterCacheScraper) GetScrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) ScrapeFunc {
  981. return func() []metric.Update {
  982. return ccs.scrapeResourceQuotas(resourceQuotas, namespaceIndex)
  983. }
  984. }
  985. func (ccs *ClusterCacheScraper) scrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) []metric.Update {
  986. var scrapeResults []metric.Update
  987. processResource := func(baseLabels map[string]string, name v1.ResourceName, quantity resource.Quantity, metricName string) metric.Update {
  988. resource, unit, value := toResourceUnitValue(name, quantity)
  989. labels := maps.Clone(baseLabels)
  990. labels[source.ResourceLabel] = resource
  991. labels[source.UnitLabel] = unit
  992. return metric.Update{
  993. Name: metricName,
  994. Labels: labels,
  995. Value: value,
  996. }
  997. }
  998. for _, resourceQuota := range resourceQuotas {
  999. nsUID, _ := namespaceIndex[resourceQuota.Namespace]
  1000. resourceQuotaInfo := map[string]string{
  1001. source.UIDLabel: string(resourceQuota.UID),
  1002. source.NamespaceUIDLabel: string(nsUID),
  1003. source.ResourceQuotaLabel: resourceQuota.Name,
  1004. }
  1005. scrapeResults = append(scrapeResults, metric.Update{
  1006. Name: metric.ResourceQuotaInfo,
  1007. Labels: resourceQuotaInfo,
  1008. AdditionalInfo: resourceQuotaInfo,
  1009. Value: 0,
  1010. })
  1011. if resourceQuota.Spec.Hard != nil {
  1012. // CPU/memory requests can also be aliased as "cpu" and "memory". For now, however, only scrape the complete names
  1013. // https://kubernetes.io/docs/concepts/policy/resource-quotas/#compute-resource-quota
  1014. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsCPU]; ok {
  1015. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1016. }
  1017. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsMemory]; ok {
  1018. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1019. }
  1020. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsCPU]; ok {
  1021. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1022. }
  1023. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsMemory]; ok {
  1024. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1025. }
  1026. }
  1027. if resourceQuota.Status.Used != nil {
  1028. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsCPU]; ok {
  1029. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1030. }
  1031. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsMemory]; ok {
  1032. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1033. }
  1034. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsCPU]; ok {
  1035. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1036. }
  1037. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsMemory]; ok {
  1038. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1039. }
  1040. }
  1041. }
  1042. events.Dispatch(event.ScrapeEvent{
  1043. ScraperName: event.KubernetesClusterScraperName,
  1044. ScrapeType: event.ResourceQuotaScraperType,
  1045. Targets: len(resourceQuotas),
  1046. Errors: nil,
  1047. })
  1048. return scrapeResults
  1049. }
  1050. // PvcInfo is used to store information about a pvc for tracking volume usage.
  1051. type PvcInfo struct {
  1052. Class string
  1053. Claim string
  1054. Namespace string
  1055. VolumeName string
  1056. Requests float64
  1057. PodsClaimed []string
  1058. }
  1059. func getPvcsInfo(pvcs []*clustercache.PersistentVolumeClaim) map[string]*PvcInfo {
  1060. toReturn := make(map[string]*PvcInfo)
  1061. for _, pvc := range pvcs {
  1062. ns := pvc.Namespace
  1063. pvcName := pvc.Name
  1064. volumeName := pvc.Spec.VolumeName
  1065. pvClass := getPersistentVolumeClaimClass(pvc)
  1066. requests := float64(pvc.Spec.Resources.Requests.Storage().Value())
  1067. key := ns + "," + pvcName
  1068. toReturn[key] = &PvcInfo{
  1069. Class: pvClass,
  1070. Claim: pvcName,
  1071. Namespace: ns,
  1072. VolumeName: volumeName,
  1073. Requests: requests,
  1074. }
  1075. }
  1076. return toReturn
  1077. }
  1078. // NodeGpuInfo contains the gpu count and vgpu counts for nodes
  1079. type NodeGpuInfo struct {
  1080. GPU float64
  1081. VGPU float64
  1082. }
  1083. func (ccs *ClusterCacheScraper) getNodesGpuInfo() map[string]*NodeGpuInfo {
  1084. // use a closure to cache allocatableVGPU result instead of calculating
  1085. // it every time we need it
  1086. var allocatableVGPUs *float64
  1087. allocVGPUs := func() (float64, error) {
  1088. if allocatableVGPUs != nil {
  1089. return *allocatableVGPUs, nil
  1090. }
  1091. vgpu, err := getAllocatableVGPUs(ccs.clusterCache.GetAllDaemonSets())
  1092. if err != nil {
  1093. return vgpu, err
  1094. }
  1095. allocatableVGPUs = &vgpu
  1096. return *allocatableVGPUs, nil
  1097. }
  1098. var nodeGpuMap map[string]*NodeGpuInfo = make(map[string]*NodeGpuInfo)
  1099. for _, node := range ccs.clusterCache.GetAllNodes() {
  1100. info, err := gpuInfoFor(node, allocVGPUs)
  1101. if err != nil {
  1102. log.Warnf("Failed to retrieve GPU Info for Node: %s - %s", node.Name, err)
  1103. continue
  1104. }
  1105. nodeGpuMap[node.Name] = info
  1106. }
  1107. return nodeGpuMap
  1108. }
  1109. // getPersistentVolumeClaimClass returns StorageClassName. If no storage class was
  1110. // requested, it returns "".
  1111. func getPersistentVolumeClaimClass(claim *clustercache.PersistentVolumeClaim) string {
  1112. // Use beta annotation first
  1113. if class, found := claim.Annotations[v1.BetaStorageClassAnnotation]; found {
  1114. return class
  1115. }
  1116. if claim.Spec.StorageClassName != nil {
  1117. return *claim.Spec.StorageClassName
  1118. }
  1119. // Special non-empty string to indicate absence of storage class.
  1120. return ""
  1121. }
  1122. // toResourceUnitValue accepts a resource name and quantity and returns the sanitized resource, the unit, and the value in the units.
  1123. // Returns an empty string for resource and unit if there was a failure.
  1124. func toResourceUnitValue(resourceName v1.ResourceName, quantity resource.Quantity) (resource string, unit string, value float64) {
  1125. resource = promutil.SanitizeLabelName(string(resourceName))
  1126. switch resourceName {
  1127. case v1.ResourceCPU:
  1128. unit = "core"
  1129. value = float64(quantity.MilliValue()) / 1000
  1130. return
  1131. case v1.ResourceStorage:
  1132. fallthrough
  1133. case v1.ResourceEphemeralStorage:
  1134. fallthrough
  1135. case v1.ResourceMemory:
  1136. unit = "byte"
  1137. value = float64(quantity.Value())
  1138. return
  1139. case v1.ResourcePods:
  1140. unit = "integer"
  1141. value = float64(quantity.Value())
  1142. return
  1143. default:
  1144. if isHugePageResourceName(resourceName) || isAttachableVolumeResourceName(resourceName) {
  1145. unit = "byte"
  1146. value = float64(quantity.Value())
  1147. return
  1148. }
  1149. if isExtendedResourceName(resourceName) {
  1150. unit = "integer"
  1151. value = float64(quantity.Value())
  1152. return
  1153. }
  1154. }
  1155. resource = ""
  1156. unit = ""
  1157. value = 0.0
  1158. return
  1159. }
  1160. func isGpuResourceName(name v1.ResourceName) bool {
  1161. return name == "nvidia.com/gpu" || name == "k8s.amazonaws.com/vgpu"
  1162. }
  1163. // isHugePageResourceName checks for a huge page container resource name
  1164. func isHugePageResourceName(name v1.ResourceName) bool {
  1165. return strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix)
  1166. }
  1167. // isAttachableVolumeResourceName checks for attached volume container resource name
  1168. func isAttachableVolumeResourceName(name v1.ResourceName) bool {
  1169. return strings.HasPrefix(string(name), v1.ResourceAttachableVolumesPrefix)
  1170. }
  1171. // isExtendedResourceName checks for extended container resource name
  1172. func isExtendedResourceName(name v1.ResourceName) bool {
  1173. if isNativeResource(name) || strings.HasPrefix(string(name), v1.DefaultResourceRequestsPrefix) {
  1174. return false
  1175. }
  1176. // Ensure it satisfies the rules in IsQualifiedName() after converted into quota resource name
  1177. nameForQuota := fmt.Sprintf("%s%s", v1.DefaultResourceRequestsPrefix, string(name))
  1178. if errs := validation.IsQualifiedName(nameForQuota); len(errs) != 0 {
  1179. return false
  1180. }
  1181. return true
  1182. }
  1183. // isNativeResource checks for a kubernetes.io/ prefixed resource name
  1184. func isNativeResource(name v1.ResourceName) bool {
  1185. return !strings.Contains(string(name), "/") || isPrefixedNativeResource(name)
  1186. }
  1187. func isPrefixedNativeResource(name v1.ResourceName) bool {
  1188. return strings.Contains(string(name), v1.ResourceDefaultNamespacePrefix)
  1189. }
  1190. // gets the Node GPUs and VGPUs using the node data from k8s. Returns nil if GPUs could not be located for the node.
  1191. func gpuInfoFor(
  1192. n *clustercache.Node,
  1193. allocatedVGPUs func() (float64, error),
  1194. ) (*NodeGpuInfo, error) {
  1195. g, hasGpu := n.Status.Capacity["nvidia.com/gpu"]
  1196. _, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
  1197. // Case 1: Standard NVIDIA GPU
  1198. if hasGpu && g.Value() != 0 && !hasReplicas {
  1199. return &NodeGpuInfo{
  1200. GPU: float64(g.Value()),
  1201. VGPU: float64(g.Value()),
  1202. }, nil
  1203. }
  1204. // Case 2: NVIDIA GPU with GPU Feature Discovery (GFD) Pod enabled.
  1205. // Ref: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html#verifying-the-gpu-time-slicing-configuration
  1206. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L44-L45
  1207. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L103-L118
  1208. if hasReplicas {
  1209. resultGPU := 0.0
  1210. resultVGPU := 0.0
  1211. if c, ok := n.Labels["nvidia.com/gpu.count"]; ok {
  1212. var err error
  1213. resultGPU, err = strconv.ParseFloat(c, 64)
  1214. if err != nil {
  1215. return nil, fmt.Errorf("could not parse label \"nvidia.com/gpu.count\": %v", err)
  1216. }
  1217. }
  1218. if s, ok := n.Status.Capacity["nvidia.com/gpu.shared"]; ok { // GFD configured `renameByDefault=true`
  1219. resultVGPU = float64(s.Value())
  1220. } else if g, ok := n.Status.Capacity["nvidia.com/gpu"]; ok { // GFD configured `renameByDefault=false`
  1221. resultVGPU = float64(g.Value())
  1222. } else {
  1223. resultVGPU = resultGPU
  1224. }
  1225. return &NodeGpuInfo{
  1226. GPU: resultGPU,
  1227. VGPU: resultVGPU,
  1228. }, nil
  1229. }
  1230. // Case 3: AWS vGPU
  1231. if vgpu, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
  1232. vgpuCount, err := allocatedVGPUs()
  1233. if err != nil {
  1234. return nil, err
  1235. }
  1236. vgpuCoeff := 10.0
  1237. if vgpuCount > 0.0 {
  1238. vgpuCoeff = vgpuCount
  1239. }
  1240. if vgpu.Value() != 0 {
  1241. resultGPU := float64(vgpu.Value()) / vgpuCoeff
  1242. resultVGPU := float64(vgpu.Value())
  1243. return &NodeGpuInfo{
  1244. GPU: resultGPU,
  1245. VGPU: resultVGPU,
  1246. }, nil
  1247. }
  1248. }
  1249. // No GPU found
  1250. return nil, nil
  1251. }
  1252. func getAllocatableVGPUs(daemonsets []*clustercache.DaemonSet) (float64, error) {
  1253. vgpuCount := 0.0
  1254. for _, ds := range daemonsets {
  1255. dsContainerList := &ds.SpecContainers
  1256. for _, ctnr := range *dsContainerList {
  1257. if ctnr.Args != nil {
  1258. for _, arg := range ctnr.Args {
  1259. if strings.Contains(arg, "--vgpu=") {
  1260. vgpus, err := strconv.ParseFloat(arg[strings.IndexByte(arg, '=')+1:], 64)
  1261. if err != nil {
  1262. log.Errorf("failed to parse vgpu allocation string %s: %v", arg, err)
  1263. continue
  1264. }
  1265. vgpuCount = vgpus
  1266. return vgpuCount, nil
  1267. }
  1268. }
  1269. }
  1270. }
  1271. }
  1272. return vgpuCount, nil
  1273. }