clustercache.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476
  1. package scrape
  2. import (
  3. "fmt"
  4. "slices"
  5. "strconv"
  6. "strings"
  7. "github.com/kubecost/events"
  8. "github.com/opencost/opencost/core/pkg/clustercache"
  9. "github.com/opencost/opencost/core/pkg/log"
  10. "github.com/opencost/opencost/core/pkg/source"
  11. coreutil "github.com/opencost/opencost/core/pkg/util"
  12. "github.com/opencost/opencost/core/pkg/util/promutil"
  13. "github.com/opencost/opencost/modules/collector-source/pkg/event"
  14. "github.com/opencost/opencost/modules/collector-source/pkg/metric"
  15. "github.com/opencost/opencost/modules/collector-source/pkg/util"
  16. "golang.org/x/exp/maps"
  17. v1 "k8s.io/api/core/v1"
  18. "k8s.io/apimachinery/pkg/api/resource"
  19. "k8s.io/apimachinery/pkg/types"
  20. "k8s.io/apimachinery/pkg/util/validation"
  21. )
  // unmountedPVsContainer is the pod label value attached to the allocation
  // update of a persistent volume claim that no scraped pod has claimed.
  const (
      unmountedPVsContainer = "unmounted-pvs"
  )
  23. type ClusterCacheScraper struct {
  24. clusterCache clustercache.ClusterCache
  25. }
  26. func newClusterCacheScraper(clusterCache clustercache.ClusterCache) Scraper {
  27. return &ClusterCacheScraper{
  28. clusterCache: clusterCache,
  29. }
  30. }
  31. func (ccs *ClusterCacheScraper) Scrape() []metric.Update {
  32. // retrieve objects for scrape
  33. nodes := ccs.clusterCache.GetAllNodes()
  34. deployments := ccs.clusterCache.GetAllDeployments()
  35. namespaces := ccs.clusterCache.GetAllNamespaces()
  36. pods := ccs.clusterCache.GetAllPods()
  37. pvcs := ccs.clusterCache.GetAllPersistentVolumeClaims()
  38. pvs := ccs.clusterCache.GetAllPersistentVolumes()
  39. services := ccs.clusterCache.GetAllServices()
  40. statefulSets := ccs.clusterCache.GetAllStatefulSets()
  41. daemonSets := ccs.clusterCache.GetAllDaemonSets()
  42. jobs := ccs.clusterCache.GetAllJobs()
  43. cronJobs := ccs.clusterCache.GetAllCronJobs()
  44. replicaSets := ccs.clusterCache.GetAllReplicaSets()
  45. resourceQuotas := ccs.clusterCache.GetAllResourceQuotas()
  46. // create scrape indexes. While the pairs being mapped here don't have a 1 to 1 relationship in the general case,
  47. // we are assuming that in the context of a single snapshot of the cluster they are 1 to 1.
  48. nodeNameToUID := buildNodeIndex(nodes)
  49. namespaceNameToUID := buildNamespaceIndex(namespaces)
  50. pvcNameToUID := buildPVCIndex(pvcs)
  51. pvNameToUID := buildPVIndex(pvs)
  52. scrapeFuncs := []ScrapeFunc{
  53. ccs.GetScrapeNodes(nodes),
  54. ccs.GetScrapeDeployments(deployments, namespaceNameToUID),
  55. ccs.GetScrapeNamespaces(namespaces),
  56. ccs.GetScrapePods(pods, pvcs, nodeNameToUID, namespaceNameToUID, pvcNameToUID),
  57. ccs.GetScrapePVCs(pvcs, namespaceNameToUID, pvNameToUID),
  58. ccs.GetScrapePVs(pvs),
  59. ccs.GetScrapeServices(services, namespaceNameToUID),
  60. ccs.GetScrapeStatefulSets(statefulSets, namespaceNameToUID),
  61. ccs.GetScrapeDaemonSets(daemonSets, namespaceNameToUID),
  62. ccs.GetScrapeJobs(jobs, namespaceNameToUID),
  63. ccs.GetScrapeCronJobs(cronJobs, namespaceNameToUID),
  64. ccs.GetScrapeReplicaSets(replicaSets, namespaceNameToUID),
  65. ccs.GetScrapeResourceQuotas(resourceQuotas, namespaceNameToUID),
  66. }
  67. return concurrentScrape(scrapeFuncs...)
  68. }
  69. func (ccs *ClusterCacheScraper) GetScrapeNodes(nodes []*clustercache.Node) ScrapeFunc {
  70. return func() []metric.Update {
  71. return ccs.scrapeNodes(nodes)
  72. }
  73. }
  74. func (ccs *ClusterCacheScraper) scrapeNodes(nodes []*clustercache.Node) []metric.Update {
  75. var scrapeResults []metric.Update
  76. for _, node := range nodes {
  77. nodeInfo := map[string]string{
  78. source.NodeLabel: node.Name,
  79. source.ProviderIDLabel: node.SpecProviderID,
  80. source.UIDLabel: string(node.UID),
  81. }
  82. if instanceType, ok := coreutil.GetInstanceType(node.Labels); ok {
  83. nodeInfo[source.InstanceTypeLabel] = instanceType
  84. }
  85. scrapeResults = append(scrapeResults, metric.Update{
  86. Name: metric.NodeInfo,
  87. Labels: nodeInfo,
  88. AdditionalInfo: nodeInfo,
  89. })
  90. // Node Capacity
  91. scrapeResults = scrapeResourceList(
  92. metric.NodeResourceCapacities,
  93. node.Status.Capacity,
  94. nodeInfo,
  95. scrapeResults)
  96. // This block and metric can be removed, when we stop exporting assets and allocations
  97. if node.Status.Capacity != nil {
  98. if quantity, ok := node.Status.Capacity[v1.ResourceCPU]; ok {
  99. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  100. scrapeResults = append(scrapeResults, metric.Update{
  101. Name: metric.KubeNodeStatusCapacityCPUCores,
  102. Labels: nodeInfo,
  103. Value: value,
  104. })
  105. }
  106. if quantity, ok := node.Status.Capacity[v1.ResourceMemory]; ok {
  107. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  108. scrapeResults = append(scrapeResults, metric.Update{
  109. Name: metric.KubeNodeStatusCapacityMemoryBytes,
  110. Labels: nodeInfo,
  111. Value: value,
  112. })
  113. }
  114. }
  115. // Node Allocatable Resources
  116. scrapeResults = scrapeResourceList(
  117. metric.NodeResourcesAllocatable,
  118. node.Status.Allocatable,
  119. nodeInfo,
  120. scrapeResults)
  121. // This block and metric can be removed, when we stop exporting assets and allocations
  122. if node.Status.Allocatable != nil {
  123. if quantity, ok := node.Status.Allocatable[v1.ResourceCPU]; ok {
  124. _, _, value := toResourceUnitValue(v1.ResourceCPU, quantity)
  125. scrapeResults = append(scrapeResults, metric.Update{
  126. Name: metric.KubeNodeStatusAllocatableCPUCores,
  127. Labels: nodeInfo,
  128. Value: value,
  129. })
  130. }
  131. if quantity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok {
  132. _, _, value := toResourceUnitValue(v1.ResourceMemory, quantity)
  133. scrapeResults = append(scrapeResults, metric.Update{
  134. Name: metric.KubeNodeStatusAllocatableMemoryBytes,
  135. Labels: nodeInfo,
  136. Value: value,
  137. })
  138. }
  139. }
  140. // node labels
  141. labelNames, labelValues := promutil.KubeLabelsToLabels(node.Labels)
  142. nodeLabels := util.ToMap(labelNames, labelValues)
  143. scrapeResults = append(scrapeResults, metric.Update{
  144. Name: metric.KubeNodeLabels,
  145. Labels: nodeInfo,
  146. Value: 0,
  147. AdditionalInfo: nodeLabels,
  148. })
  149. }
  150. events.Dispatch(event.ScrapeEvent{
  151. ScraperName: event.KubernetesClusterScraperName,
  152. ScrapeType: event.NodeScraperType,
  153. Targets: len(nodes),
  154. Errors: nil,
  155. })
  156. return scrapeResults
  157. }
  158. func (ccs *ClusterCacheScraper) GetScrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) ScrapeFunc {
  159. return func() []metric.Update {
  160. return ccs.scrapeDeployments(deployments, namespaceIndex)
  161. }
  162. }
  163. func (ccs *ClusterCacheScraper) scrapeDeployments(deployments []*clustercache.Deployment, namespaceIndex map[string]types.UID) []metric.Update {
  164. var scrapeResults []metric.Update
  165. for _, deployment := range deployments {
  166. nsUID, ok := namespaceIndex[deployment.Namespace]
  167. if !ok {
  168. log.Debugf("deployment namespaceUID missing from index for namespace name '%s'", deployment.Namespace)
  169. }
  170. deploymentInfo := map[string]string{
  171. source.UIDLabel: string(deployment.UID),
  172. source.NamespaceUIDLabel: string(nsUID),
  173. source.NamespaceLabel: deployment.Namespace,
  174. source.DeploymentLabel: deployment.Name,
  175. }
  176. scrapeResults = append(scrapeResults, metric.Update{
  177. Name: metric.DeploymentInfo,
  178. Labels: deploymentInfo,
  179. Value: 0,
  180. AdditionalInfo: deploymentInfo,
  181. })
  182. // deployment labels
  183. labelNames, labelValues := promutil.KubeLabelsToLabels(deployment.Labels)
  184. deploymentLabels := util.ToMap(labelNames, labelValues)
  185. scrapeResults = append(scrapeResults, metric.Update{
  186. Name: metric.DeploymentLabels,
  187. Labels: deploymentInfo,
  188. Value: 0,
  189. AdditionalInfo: deploymentLabels,
  190. })
  191. // deployment annotations
  192. annoationNames, annotationValues := promutil.KubeAnnotationsToLabels(deployment.Annotations)
  193. deploymentAnnotations := util.ToMap(annoationNames, annotationValues)
  194. scrapeResults = append(scrapeResults, metric.Update{
  195. Name: metric.DeploymentAnnotations,
  196. Labels: deploymentInfo,
  197. Value: 0,
  198. AdditionalInfo: deploymentAnnotations,
  199. })
  200. // deployment match labels
  201. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(deployment.MatchLabels)
  202. deploymentMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  203. scrapeResults = append(scrapeResults, metric.Update{
  204. Name: metric.DeploymentMatchLabels,
  205. Labels: deploymentInfo,
  206. Value: 0,
  207. AdditionalInfo: deploymentMatchLabels,
  208. })
  209. }
  210. events.Dispatch(event.ScrapeEvent{
  211. ScraperName: event.KubernetesClusterScraperName,
  212. ScrapeType: event.DeploymentScraperType,
  213. Targets: len(deployments),
  214. Errors: nil,
  215. })
  216. return scrapeResults
  217. }
  218. func (ccs *ClusterCacheScraper) GetScrapeNamespaces(namespaces []*clustercache.Namespace) ScrapeFunc {
  219. return func() []metric.Update {
  220. return ccs.scrapeNamespaces(namespaces)
  221. }
  222. }
  223. func (ccs *ClusterCacheScraper) scrapeNamespaces(namespaces []*clustercache.Namespace) []metric.Update {
  224. var scrapeResults []metric.Update
  225. for _, namespace := range namespaces {
  226. namespaceInfo := map[string]string{
  227. source.NamespaceLabel: namespace.Name,
  228. source.UIDLabel: string(namespace.UID),
  229. }
  230. scrapeResults = append(scrapeResults, metric.Update{
  231. Name: metric.NamespaceInfo,
  232. Labels: namespaceInfo,
  233. AdditionalInfo: namespaceInfo,
  234. Value: 0,
  235. })
  236. // namespace labels
  237. labelNames, labelValues := promutil.KubeLabelsToLabels(namespace.Labels)
  238. namespaceLabels := util.ToMap(labelNames, labelValues)
  239. scrapeResults = append(scrapeResults, metric.Update{
  240. Name: metric.KubeNamespaceLabels,
  241. Labels: namespaceInfo,
  242. Value: 0,
  243. AdditionalInfo: namespaceLabels,
  244. })
  245. // namespace annotations
  246. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(namespace.Annotations)
  247. namespaceAnnotations := util.ToMap(annotationNames, annotationValues)
  248. scrapeResults = append(scrapeResults, metric.Update{
  249. Name: metric.KubeNamespaceAnnotations,
  250. Labels: namespaceInfo,
  251. Value: 0,
  252. AdditionalInfo: namespaceAnnotations,
  253. })
  254. }
  255. events.Dispatch(event.ScrapeEvent{
  256. ScraperName: event.KubernetesClusterScraperName,
  257. ScrapeType: event.NamespaceScraperType,
  258. Targets: len(namespaces),
  259. Errors: nil,
  260. })
  261. return scrapeResults
  262. }
  263. func (ccs *ClusterCacheScraper) GetScrapePods(
  264. pods []*clustercache.Pod,
  265. pvcs []*clustercache.PersistentVolumeClaim,
  266. nodeIndex map[string]types.UID,
  267. namespaceIndex map[string]types.UID,
  268. pvcIndex map[pvcKey]types.UID,
  269. ) ScrapeFunc {
  270. return func() []metric.Update {
  271. return ccs.scrapePods(pods, pvcs, nodeIndex, namespaceIndex, pvcIndex)
  272. }
  273. }
  274. func (ccs *ClusterCacheScraper) scrapePods(
  275. pods []*clustercache.Pod,
  276. pvcs []*clustercache.PersistentVolumeClaim,
  277. nodeIndex map[string]types.UID,
  278. namespaceIndex map[string]types.UID,
  279. pvcIndex map[pvcKey]types.UID,
  280. ) []metric.Update {
  281. // this is only populated if we find gpu resources being requested
  282. var nodesGpuInfo map[string]*NodeGpuInfo
  283. // pv allocation and unmounted pvs
  284. pvcInfo := getPvcsInfo(pvcs)
  285. // pod info by uid
  286. podInfoByUid := make(map[string]map[string]string)
  287. var scrapeResults []metric.Update
  288. for _, pod := range pods {
  289. // pods without a set node name are not running
  290. if pod.Spec.NodeName == "" {
  291. continue
  292. }
  293. nodeUID, ok := nodeIndex[pod.Spec.NodeName]
  294. if !ok {
  295. log.Debugf("pod nodeUID missing from index for node name '%s'", pod.Spec.NodeName)
  296. }
  297. nsUID, ok := namespaceIndex[pod.Namespace]
  298. if !ok {
  299. log.Debugf("pod namespaceUID missing from index for namespace name '%s'", pod.Namespace)
  300. }
  301. podInfo := map[string]string{
  302. source.UIDLabel: string(pod.UID),
  303. source.PodLabel: pod.Name,
  304. source.NamespaceUIDLabel: string(nsUID),
  305. source.NodeUIDLabel: string(nodeUID),
  306. }
  307. scrapeResults = append(scrapeResults, metric.Update{
  308. Name: metric.PodInfo,
  309. Labels: podInfo,
  310. Value: 0,
  311. AdditionalInfo: podInfo,
  312. })
  313. podInfo[source.NamespaceLabel] = pod.Namespace
  314. podInfo[source.NodeLabel] = pod.Spec.NodeName
  315. podInfo[source.InstanceLabel] = pod.Spec.NodeName
  316. podInfoByUid[string(pod.UID)] = podInfo
  317. // pod labels
  318. labelNames, labelValues := promutil.KubeLabelsToLabels(pod.Labels)
  319. podLabels := util.ToMap(labelNames, labelValues)
  320. scrapeResults = append(scrapeResults, metric.Update{
  321. Name: metric.KubePodLabels,
  322. Labels: podInfo,
  323. Value: 0,
  324. AdditionalInfo: podLabels,
  325. })
  326. // pod annotations
  327. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(pod.Annotations)
  328. podAnnotations := util.ToMap(annotationNames, annotationValues)
  329. scrapeResults = append(scrapeResults, metric.Update{
  330. Name: metric.KubePodAnnotations,
  331. Labels: podInfo,
  332. Value: 0,
  333. AdditionalInfo: podAnnotations,
  334. })
  335. // Determine PVC use data for Pod
  336. claimed := make(map[string]struct{})
  337. for _, volume := range pod.Spec.Volumes {
  338. if volume.PersistentVolumeClaim != nil {
  339. name := volume.PersistentVolumeClaim.ClaimName
  340. key := pod.Namespace + "," + name
  341. if _, seen := claimed[key]; seen {
  342. continue
  343. }
  344. if pvc, ok := pvcInfo[key]; ok {
  345. pvc.PodsClaimed = append(pvc.PodsClaimed, string(pod.UID))
  346. claimed[key] = struct{}{}
  347. }
  348. }
  349. }
  350. // Pod owner metric
  351. for _, owner := range pod.OwnerReferences {
  352. controller := "false"
  353. if owner.Controller != nil && *owner.Controller {
  354. controller = "true"
  355. }
  356. ownerInfo := maps.Clone(podInfo)
  357. ownerInfo[source.OwnerKindLabel] = owner.Kind
  358. ownerInfo[source.OwnerNameLabel] = owner.Name
  359. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  360. ownerInfo[source.ControllerLabel] = controller
  361. scrapeResults = append(scrapeResults, metric.Update{
  362. Name: metric.KubePodOwner,
  363. Labels: ownerInfo,
  364. Value: 0,
  365. AdditionalInfo: ownerInfo,
  366. })
  367. }
  368. // Container Status
  369. for _, status := range pod.Status.ContainerStatuses {
  370. if status.State.Running != nil {
  371. containerInfo := maps.Clone(podInfo)
  372. containerInfo[source.ContainerLabel] = status.Name
  373. scrapeResults = append(scrapeResults, metric.Update{
  374. Name: metric.KubePodContainerStatusRunning,
  375. Labels: containerInfo,
  376. AdditionalInfo: containerInfo,
  377. Value: 0,
  378. })
  379. }
  380. }
  381. for _, volume := range pod.Spec.Volumes {
  382. if volume.PersistentVolumeClaim != nil {
  383. pvcUID, ok := pvcIndex[pvcKey{
  384. name: volume.PersistentVolumeClaim.ClaimName,
  385. namespace: pod.Namespace,
  386. }]
  387. if !ok {
  388. continue
  389. }
  390. podPVCVolumeInfo := map[string]string{
  391. source.UIDLabel: string(pod.UID),
  392. source.PVCUIDLabel: string(pvcUID),
  393. source.PodVolumeNameLabel: volume.Name,
  394. }
  395. scrapeResults = append(scrapeResults, metric.Update{
  396. Name: metric.PodPVCVolume,
  397. Labels: podPVCVolumeInfo,
  398. Value: 0,
  399. })
  400. }
  401. }
  402. for _, container := range pod.Spec.Containers {
  403. containerInfo := maps.Clone(podInfo)
  404. containerInfo[source.ContainerLabel] = container.Name
  405. // Requests
  406. scrapeResults = scrapeResourceList(
  407. metric.KubePodContainerResourceRequests,
  408. container.Resources.Requests,
  409. containerInfo,
  410. scrapeResults)
  411. // Limits
  412. scrapeResults = scrapeResourceList(
  413. metric.KubePodContainerResourceLimits,
  414. container.Resources.Limits,
  415. containerInfo,
  416. scrapeResults)
  417. // Todo remove when asset/allocation pipeline are removed
  418. // gpu "requests" is either the request or limit if it exists
  419. var gpuRequest *float64
  420. for resourceName, quantity := range container.Resources.Requests {
  421. if isGpuResourceName(resourceName) {
  422. // set gpu request if it exists
  423. _, _, value := toResourceUnitValue(resourceName, quantity)
  424. gpuRequestValue := value
  425. gpuRequest = &gpuRequestValue
  426. break
  427. }
  428. }
  429. // Limits
  430. if gpuRequest == nil {
  431. for resourceName, quantity := range container.Resources.Limits {
  432. if isGpuResourceName(resourceName) {
  433. // set gpu request if it exists
  434. _, _, value := toResourceUnitValue(resourceName, quantity)
  435. gpuRequestValue := value
  436. gpuRequest = &gpuRequestValue
  437. break
  438. }
  439. }
  440. }
  441. // handle the GPU allocation metric here IFF there exists a request/limit for GPUs
  442. // we only load the node gpu data map if we run into a container with gpu requests/limits
  443. if gpuRequest != nil {
  444. if nodesGpuInfo == nil {
  445. nodesGpuInfo = ccs.getNodesGpuInfo()
  446. }
  447. gpuAlloc := *gpuRequest
  448. if nodeGpuInfo, ok := nodesGpuInfo[pod.Spec.NodeName]; ok {
  449. if nodeGpuInfo != nil && nodeGpuInfo.VGPU != 0 {
  450. gpuAlloc = gpuAlloc * (nodeGpuInfo.GPU / nodeGpuInfo.VGPU)
  451. }
  452. }
  453. scrapeResults = append(scrapeResults, metric.Update{
  454. Name: metric.ContainerGPUAllocation,
  455. Labels: maps.Clone(containerInfo),
  456. Value: gpuAlloc,
  457. })
  458. }
  459. }
  460. }
  461. // Iterate through PVC Info after the pods have been tallied and export
  462. // allocation metrics based on the number of other pods claiming the volume
  463. for _, pvc := range pvcInfo {
  464. // unmounted pvs get full allocation
  465. if len(pvc.PodsClaimed) == 0 {
  466. labels := map[string]string{
  467. source.PodLabel: unmountedPVsContainer,
  468. source.NamespaceLabel: pvc.Namespace,
  469. source.PVCLabel: pvc.Claim,
  470. source.PVLabel: pvc.VolumeName,
  471. }
  472. scrapeResults = append(scrapeResults, metric.Update{
  473. Name: metric.PodPVCAllocation,
  474. Labels: labels,
  475. Value: pvc.Requests,
  476. })
  477. continue
  478. }
  479. // pods get a proportion of pv allocation
  480. value := pvc.Requests / float64(len(pvc.PodsClaimed))
  481. for _, podUid := range pvc.PodsClaimed {
  482. podInfo, ok := podInfoByUid[podUid]
  483. if !ok {
  484. continue
  485. }
  486. pvcLabels := maps.Clone(podInfo)
  487. pvcLabels[source.PVCLabel] = pvc.Claim
  488. pvcLabels[source.PVLabel] = pvc.VolumeName
  489. scrapeResults = append(scrapeResults, metric.Update{
  490. Name: metric.PodPVCAllocation,
  491. Labels: pvcLabels,
  492. Value: value,
  493. })
  494. }
  495. }
  496. events.Dispatch(event.ScrapeEvent{
  497. ScraperName: event.KubernetesClusterScraperName,
  498. ScrapeType: event.PodScraperType,
  499. Targets: len(pods),
  500. Errors: nil,
  501. })
  502. return scrapeResults
  503. }
  504. func scrapeResourceList(metricName string, resourceList v1.ResourceList, baseLabels map[string]string, scrapeResults []metric.Update) []metric.Update {
  505. if resourceList != nil {
  506. // sorting keys here for testing purposes
  507. keys := maps.Keys(resourceList)
  508. slices.Sort(keys)
  509. for _, resourceName := range keys {
  510. quantity := resourceList[resourceName]
  511. resource, unit, value := toResourceUnitValue(resourceName, quantity)
  512. // failed to parse the resource type
  513. if resource == "" {
  514. log.DedupedWarningf(5, "Failed to parse resource units and quantity for resource: %s", resourceName)
  515. continue
  516. }
  517. resourceRequestInfo := maps.Clone(baseLabels)
  518. resourceRequestInfo[source.ResourceLabel] = resource
  519. resourceRequestInfo[source.UnitLabel] = unit
  520. scrapeResults = append(scrapeResults, metric.Update{
  521. Name: metricName,
  522. Labels: resourceRequestInfo,
  523. Value: value,
  524. })
  525. }
  526. }
  527. return scrapeResults
  528. }
  529. func (ccs *ClusterCacheScraper) GetScrapePVCs(
  530. pvcs []*clustercache.PersistentVolumeClaim,
  531. namespaceIndex map[string]types.UID,
  532. pvIndex map[string]types.UID,
  533. ) ScrapeFunc {
  534. return func() []metric.Update {
  535. return ccs.scrapePVCs(pvcs, namespaceIndex, pvIndex)
  536. }
  537. }
  538. func (ccs *ClusterCacheScraper) scrapePVCs(
  539. pvcs []*clustercache.PersistentVolumeClaim,
  540. namespaceIndex map[string]types.UID,
  541. pvIndex map[string]types.UID,
  542. ) []metric.Update {
  543. var scrapeResults []metric.Update
  544. for _, pvc := range pvcs {
  545. nsUID, ok := namespaceIndex[pvc.Namespace]
  546. if !ok {
  547. log.Debugf("pvc namespaceUID missing from index for namespace name '%s'", pvc.Namespace)
  548. }
  549. pvUID, ok := pvIndex[pvc.Spec.VolumeName]
  550. if !ok && pvc.Spec.VolumeName != "" {
  551. log.Debugf("pvc volume name missing from index for pv name '%s'", pvc.Spec.VolumeName)
  552. }
  553. pvcInfo := map[string]string{
  554. source.UIDLabel: string(pvc.UID),
  555. source.PVCLabel: pvc.Name,
  556. source.NamespaceUIDLabel: string(nsUID),
  557. source.NamespaceLabel: pvc.Namespace,
  558. source.VolumeNameLabel: pvc.Spec.VolumeName,
  559. source.PVUIDLabel: string(pvUID),
  560. source.StorageClassLabel: getPersistentVolumeClaimClass(pvc),
  561. }
  562. scrapeResults = append(scrapeResults, metric.Update{
  563. Name: metric.KubePersistentVolumeClaimInfo,
  564. Labels: pvcInfo,
  565. AdditionalInfo: pvcInfo,
  566. Value: 0,
  567. })
  568. if storage, ok := pvc.Spec.Resources.Requests[v1.ResourceStorage]; ok {
  569. scrapeResults = append(scrapeResults, metric.Update{
  570. Name: metric.KubePersistentVolumeClaimResourceRequestsStorageBytes,
  571. Labels: pvcInfo,
  572. Value: float64(storage.Value()),
  573. })
  574. }
  575. }
  576. events.Dispatch(event.ScrapeEvent{
  577. ScraperName: event.KubernetesClusterScraperName,
  578. ScrapeType: event.PvcScraperType,
  579. Targets: len(pvcs),
  580. Errors: nil,
  581. })
  582. return scrapeResults
  583. }
  584. func (ccs *ClusterCacheScraper) GetScrapePVs(pvs []*clustercache.PersistentVolume) ScrapeFunc {
  585. return func() []metric.Update {
  586. return ccs.scrapePVs(pvs)
  587. }
  588. }
  589. func (ccs *ClusterCacheScraper) scrapePVs(pvs []*clustercache.PersistentVolume) []metric.Update {
  590. var scrapeResults []metric.Update
  591. for _, pv := range pvs {
  592. providerID := pv.Name
  593. var csiVolumeHandle string
  594. // if a more accurate provider ID is available, use that
  595. if pv.Spec.CSI != nil && pv.Spec.CSI.VolumeHandle != "" {
  596. providerID = pv.Spec.CSI.VolumeHandle
  597. csiVolumeHandle = pv.Spec.CSI.VolumeHandle
  598. }
  599. pvInfo := map[string]string{
  600. source.UIDLabel: string(pv.UID),
  601. source.PVLabel: pv.Name,
  602. source.StorageClassLabel: pv.Spec.StorageClassName,
  603. source.ProviderIDLabel: providerID,
  604. source.CSIVolumeHandleLabel: csiVolumeHandle,
  605. }
  606. scrapeResults = append(scrapeResults, metric.Update{
  607. Name: metric.KubecostPVInfo,
  608. Labels: pvInfo,
  609. AdditionalInfo: pvInfo,
  610. Value: 0,
  611. })
  612. if storage, ok := pv.Spec.Capacity[v1.ResourceStorage]; ok {
  613. scrapeResults = append(scrapeResults, metric.Update{
  614. Name: metric.KubePersistentVolumeCapacityBytes,
  615. Labels: pvInfo,
  616. Value: float64(storage.Value()),
  617. })
  618. }
  619. }
  620. events.Dispatch(event.ScrapeEvent{
  621. ScraperName: event.KubernetesClusterScraperName,
  622. ScrapeType: event.PvScraperType,
  623. Targets: len(pvs),
  624. Errors: nil,
  625. })
  626. return scrapeResults
  627. }
  628. func (ccs *ClusterCacheScraper) GetScrapeServices(
  629. services []*clustercache.Service,
  630. namespaceIndex map[string]types.UID,
  631. ) ScrapeFunc {
  632. return func() []metric.Update {
  633. return ccs.scrapeServices(services, namespaceIndex)
  634. }
  635. }
  636. func (ccs *ClusterCacheScraper) scrapeServices(
  637. services []*clustercache.Service,
  638. namespaceIndex map[string]types.UID,
  639. ) []metric.Update {
  640. var scrapeResults []metric.Update
  641. for _, service := range services {
  642. namespaceUID := namespaceIndex[service.Namespace]
  643. // Assuming one address for now
  644. var lbIngressAddress string
  645. lbIngressAddresses := clustercache.GetLoadBalancerIngressAddress(service)
  646. if len(lbIngressAddresses) > 0 {
  647. lbIngressAddress = lbIngressAddresses[0]
  648. }
  649. serviceInfo := map[string]string{
  650. source.UIDLabel: string(service.UID),
  651. source.ServiceLabel: service.Name,
  652. source.NamespaceLabel: service.Namespace,
  653. source.NamespaceUIDLabel: string(namespaceUID),
  654. source.ServiceTypeLabel: string(service.Type),
  655. source.LBIngressAddress: lbIngressAddress,
  656. }
  657. scrapeResults = append(scrapeResults, metric.Update{
  658. Name: metric.ServiceInfo,
  659. Labels: serviceInfo,
  660. Value: 0,
  661. AdditionalInfo: serviceInfo,
  662. })
  663. // service selector labels
  664. selectorNames, selectorValues := promutil.KubeLabelsToLabels(service.SpecSelector)
  665. serviceLabels := util.ToMap(selectorNames, selectorValues)
  666. scrapeResults = append(scrapeResults, metric.Update{
  667. Name: metric.ServiceSelectorLabels,
  668. Labels: serviceInfo,
  669. Value: 0,
  670. AdditionalInfo: serviceLabels,
  671. })
  672. }
  673. events.Dispatch(event.ScrapeEvent{
  674. ScraperName: event.KubernetesClusterScraperName,
  675. ScrapeType: event.ServiceScraperType,
  676. Targets: len(services),
  677. Errors: nil,
  678. })
  679. return scrapeResults
  680. }
  681. func (ccs *ClusterCacheScraper) GetScrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  682. return func() []metric.Update {
  683. return ccs.scrapeStatefulSets(statefulSets, namespaceIndex)
  684. }
  685. }
  686. func (ccs *ClusterCacheScraper) scrapeStatefulSets(statefulSets []*clustercache.StatefulSet, namespaceIndex map[string]types.UID) []metric.Update {
  687. var scrapeResults []metric.Update
  688. for _, statefulSet := range statefulSets {
  689. nsUID, ok := namespaceIndex[statefulSet.Namespace]
  690. if !ok {
  691. log.Debugf("statefulSet namespaceUID missing from index for namespace name '%s'", statefulSet.Namespace)
  692. }
  693. statefulSetInfo := map[string]string{
  694. source.UIDLabel: string(statefulSet.UID),
  695. source.NamespaceUIDLabel: string(nsUID),
  696. source.StatefulSetLabel: statefulSet.Name,
  697. }
  698. // statefulSet info
  699. scrapeResults = append(scrapeResults, metric.Update{
  700. Name: metric.StatefulSetInfo,
  701. Labels: statefulSetInfo,
  702. Value: 0,
  703. AdditionalInfo: statefulSetInfo,
  704. })
  705. // statefulSet labels
  706. labelNames, labelValues := promutil.KubeLabelsToLabels(statefulSet.Labels)
  707. statefulSetLabels := util.ToMap(labelNames, labelValues)
  708. scrapeResults = append(scrapeResults, metric.Update{
  709. Name: metric.StatefulSetLabels,
  710. Labels: statefulSetInfo,
  711. Value: 0,
  712. AdditionalInfo: statefulSetLabels,
  713. })
  714. // statefulSet annotations
  715. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(statefulSet.Annotations)
  716. statefulSetAnnotations := util.ToMap(annotationNames, annotationValues)
  717. scrapeResults = append(scrapeResults, metric.Update{
  718. Name: metric.StatefulSetAnnotations,
  719. Labels: statefulSetInfo,
  720. Value: 0,
  721. AdditionalInfo: statefulSetAnnotations,
  722. })
  723. // statefulSet match labels
  724. statefulSetInfo[source.NamespaceLabel] = statefulSet.Namespace
  725. matchLabelNames, matchLabelValues := promutil.KubeLabelsToLabels(statefulSet.SpecSelector.MatchLabels)
  726. statefulSetMatchLabels := util.ToMap(matchLabelNames, matchLabelValues)
  727. scrapeResults = append(scrapeResults, metric.Update{
  728. Name: metric.StatefulSetMatchLabels,
  729. Labels: statefulSetInfo,
  730. Value: 0,
  731. AdditionalInfo: statefulSetMatchLabels,
  732. })
  733. }
  734. events.Dispatch(event.ScrapeEvent{
  735. ScraperName: event.KubernetesClusterScraperName,
  736. ScrapeType: event.StatefulSetScraperType,
  737. Targets: len(statefulSets),
  738. Errors: nil,
  739. })
  740. return scrapeResults
  741. }
  742. func (ccs *ClusterCacheScraper) GetScrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  743. return func() []metric.Update {
  744. return ccs.scrapeDaemonSets(daemonSets, namespaceIndex)
  745. }
  746. }
  747. func (ccs *ClusterCacheScraper) scrapeDaemonSets(daemonSets []*clustercache.DaemonSet, namespaceIndex map[string]types.UID) []metric.Update {
  748. var scrapeResults []metric.Update
  749. for _, daemonSet := range daemonSets {
  750. nsUID, ok := namespaceIndex[daemonSet.Namespace]
  751. if !ok {
  752. log.Debugf("daemonSet namespaceUID missing from index for namespace name '%s'", daemonSet.Namespace)
  753. }
  754. daemonSetInfo := map[string]string{
  755. source.UIDLabel: string(daemonSet.UID),
  756. source.NamespaceUIDLabel: string(nsUID),
  757. source.DaemonSetLabel: daemonSet.Name,
  758. }
  759. // daemonSet info
  760. scrapeResults = append(scrapeResults, metric.Update{
  761. Name: metric.DaemonSetInfo,
  762. Labels: daemonSetInfo,
  763. Value: 0,
  764. AdditionalInfo: daemonSetInfo,
  765. })
  766. // daemonSet labels
  767. labelNames, labelValues := promutil.KubeLabelsToLabels(daemonSet.Labels)
  768. daemonSetLabels := util.ToMap(labelNames, labelValues)
  769. scrapeResults = append(scrapeResults, metric.Update{
  770. Name: metric.DaemonSetLabels,
  771. Labels: daemonSetInfo,
  772. Value: 0,
  773. AdditionalInfo: daemonSetLabels,
  774. })
  775. // daemonSet annotations
  776. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(daemonSet.Annotations)
  777. daemonSetAnnotations := util.ToMap(annotationNames, annotationValues)
  778. scrapeResults = append(scrapeResults, metric.Update{
  779. Name: metric.DaemonSetAnnotations,
  780. Labels: daemonSetInfo,
  781. Value: 0,
  782. AdditionalInfo: daemonSetAnnotations,
  783. })
  784. }
  785. events.Dispatch(event.ScrapeEvent{
  786. ScraperName: event.KubernetesClusterScraperName,
  787. ScrapeType: event.DaemonSetScraperType,
  788. Targets: len(daemonSets),
  789. Errors: nil,
  790. })
  791. return scrapeResults
  792. }
  793. func (ccs *ClusterCacheScraper) GetScrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) ScrapeFunc {
  794. return func() []metric.Update {
  795. return ccs.scrapeJobs(jobs, namespaceIndex)
  796. }
  797. }
  798. func (ccs *ClusterCacheScraper) scrapeJobs(jobs []*clustercache.Job, namespaceIndex map[string]types.UID) []metric.Update {
  799. var scrapeResults []metric.Update
  800. for _, job := range jobs {
  801. nsUID, ok := namespaceIndex[job.Namespace]
  802. if !ok {
  803. log.Debugf("job namespaceUID missing from index for namespace name '%s'", job.Namespace)
  804. }
  805. jobInfo := map[string]string{
  806. source.UIDLabel: string(job.UID),
  807. source.NamespaceUIDLabel: string(nsUID),
  808. source.JobLabel: job.Name,
  809. }
  810. // job info
  811. scrapeResults = append(scrapeResults, metric.Update{
  812. Name: metric.JobInfo,
  813. Labels: jobInfo,
  814. Value: 0,
  815. AdditionalInfo: jobInfo,
  816. })
  817. // job labels
  818. labelNames, labelValues := promutil.KubeLabelsToLabels(job.Labels)
  819. jobLabels := util.ToMap(labelNames, labelValues)
  820. scrapeResults = append(scrapeResults, metric.Update{
  821. Name: metric.JobLabels,
  822. Labels: jobInfo,
  823. Value: 0,
  824. AdditionalInfo: jobLabels,
  825. })
  826. // job annotations
  827. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(job.Annotations)
  828. jobAnnotations := util.ToMap(annotationNames, annotationValues)
  829. scrapeResults = append(scrapeResults, metric.Update{
  830. Name: metric.JobAnnotations,
  831. Labels: jobInfo,
  832. Value: 0,
  833. AdditionalInfo: jobAnnotations,
  834. })
  835. }
  836. events.Dispatch(event.ScrapeEvent{
  837. ScraperName: event.KubernetesClusterScraperName,
  838. ScrapeType: event.JobScraperType,
  839. Targets: len(jobs),
  840. Errors: nil,
  841. })
  842. return scrapeResults
  843. }
  844. func (ccs *ClusterCacheScraper) GetScrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) ScrapeFunc {
  845. return func() []metric.Update {
  846. return ccs.scrapeCronJobs(cronJobs, namespaceIndex)
  847. }
  848. }
  849. func (ccs *ClusterCacheScraper) scrapeCronJobs(cronJobs []*clustercache.CronJob, namespaceIndex map[string]types.UID) []metric.Update {
  850. var scrapeResults []metric.Update
  851. for _, cronJob := range cronJobs {
  852. nsUID, ok := namespaceIndex[cronJob.Namespace]
  853. if !ok {
  854. log.Debugf("cronjob namespaceUID missing from index for namespace name '%s'", cronJob.Namespace)
  855. }
  856. cronJobInfo := map[string]string{
  857. source.UIDLabel: string(cronJob.UID),
  858. source.NamespaceUIDLabel: string(nsUID),
  859. source.CronJobLabel: cronJob.Name,
  860. }
  861. // cronjob info
  862. scrapeResults = append(scrapeResults, metric.Update{
  863. Name: metric.CronJobInfo,
  864. Labels: cronJobInfo,
  865. Value: 0,
  866. AdditionalInfo: cronJobInfo,
  867. })
  868. // cronjob labels
  869. labelNames, labelValues := promutil.KubeLabelsToLabels(cronJob.Labels)
  870. cronJobLabels := util.ToMap(labelNames, labelValues)
  871. scrapeResults = append(scrapeResults, metric.Update{
  872. Name: metric.CronJobLabels,
  873. Labels: cronJobInfo,
  874. Value: 0,
  875. AdditionalInfo: cronJobLabels,
  876. })
  877. // cronjob annotations
  878. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(cronJob.Annotations)
  879. cronJobAnnotations := util.ToMap(annotationNames, annotationValues)
  880. scrapeResults = append(scrapeResults, metric.Update{
  881. Name: metric.CronJobAnnotations,
  882. Labels: cronJobInfo,
  883. Value: 0,
  884. AdditionalInfo: cronJobAnnotations,
  885. })
  886. }
  887. events.Dispatch(event.ScrapeEvent{
  888. ScraperName: event.KubernetesClusterScraperName,
  889. ScrapeType: event.CronJobScraperType,
  890. Targets: len(cronJobs),
  891. Errors: nil,
  892. })
  893. return scrapeResults
  894. }
  895. func (ccs *ClusterCacheScraper) GetScrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) ScrapeFunc {
  896. return func() []metric.Update {
  897. return ccs.scrapeReplicaSets(replicaSets, namespaceIndex)
  898. }
  899. }
  900. func (ccs *ClusterCacheScraper) scrapeReplicaSets(replicaSets []*clustercache.ReplicaSet, namespaceIndex map[string]types.UID) []metric.Update {
  901. var scrapeResults []metric.Update
  902. for _, replicaSet := range replicaSets {
  903. nsUID, ok := namespaceIndex[replicaSet.Namespace]
  904. if !ok {
  905. log.Debugf("replicaset namespaceUID missing from index for namespace name '%s'", replicaSet.Namespace)
  906. }
  907. replicaSetInfo := map[string]string{
  908. source.UIDLabel: string(replicaSet.UID),
  909. source.NamespaceUIDLabel: string(nsUID),
  910. source.ReplicaSetLabel: replicaSet.Name,
  911. }
  912. // replicaset info
  913. scrapeResults = append(scrapeResults, metric.Update{
  914. Name: metric.ReplicaSetInfo,
  915. Labels: replicaSetInfo,
  916. Value: 0,
  917. AdditionalInfo: replicaSetInfo,
  918. })
  919. // replicaset labels
  920. labelNames, labelValues := promutil.KubeLabelsToLabels(replicaSet.Labels)
  921. replicaSetLabels := util.ToMap(labelNames, labelValues)
  922. scrapeResults = append(scrapeResults, metric.Update{
  923. Name: metric.ReplicaSetLabels,
  924. Labels: replicaSetInfo,
  925. Value: 0,
  926. AdditionalInfo: replicaSetLabels,
  927. })
  928. // replicaset annotations
  929. annotationNames, annotationValues := promutil.KubeAnnotationsToLabels(replicaSet.Annotations)
  930. replicaSetAnnotations := util.ToMap(annotationNames, annotationValues)
  931. scrapeResults = append(scrapeResults, metric.Update{
  932. Name: metric.ReplicaSetAnnotations,
  933. Labels: replicaSetInfo,
  934. Value: 0,
  935. AdditionalInfo: replicaSetAnnotations,
  936. })
  937. // owner references for backward compatibility
  938. replicaSetOwnerInfo := map[string]string{
  939. source.ReplicaSetLabel: replicaSet.Name,
  940. source.NamespaceLabel: replicaSet.Namespace,
  941. source.UIDLabel: string(replicaSet.UID),
  942. }
  943. // this specific metric exports a special <none> value for name and kind
  944. // if there are no owners
  945. if len(replicaSet.OwnerReferences) == 0 {
  946. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  947. ownerInfo[source.OwnerKindLabel] = source.NoneLabelValue
  948. ownerInfo[source.OwnerNameLabel] = source.NoneLabelValue
  949. ownerInfo[source.ControllerLabel] = "false"
  950. scrapeResults = append(scrapeResults, metric.Update{
  951. Name: metric.KubeReplicasetOwner,
  952. Labels: ownerInfo,
  953. Value: 0,
  954. AdditionalInfo: ownerInfo,
  955. })
  956. } else {
  957. for _, owner := range replicaSet.OwnerReferences {
  958. controller := "false"
  959. if owner.Controller != nil && *owner.Controller {
  960. controller = "true"
  961. }
  962. ownerInfo := maps.Clone(replicaSetOwnerInfo)
  963. ownerInfo[source.OwnerKindLabel] = owner.Kind
  964. ownerInfo[source.OwnerNameLabel] = owner.Name
  965. ownerInfo[source.OwnerUIDLabel] = string(owner.UID)
  966. ownerInfo[source.ControllerLabel] = controller
  967. scrapeResults = append(scrapeResults, metric.Update{
  968. Name: metric.KubeReplicasetOwner,
  969. Labels: ownerInfo,
  970. Value: 0,
  971. AdditionalInfo: ownerInfo,
  972. })
  973. }
  974. }
  975. }
  976. events.Dispatch(event.ScrapeEvent{
  977. ScraperName: event.KubernetesClusterScraperName,
  978. ScrapeType: event.ReplicaSetScraperType,
  979. Targets: len(replicaSets),
  980. Errors: nil,
  981. })
  982. return scrapeResults
  983. }
  984. func (ccs *ClusterCacheScraper) GetScrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) ScrapeFunc {
  985. return func() []metric.Update {
  986. return ccs.scrapeResourceQuotas(resourceQuotas, namespaceIndex)
  987. }
  988. }
  989. func (ccs *ClusterCacheScraper) scrapeResourceQuotas(resourceQuotas []*clustercache.ResourceQuota, namespaceIndex map[string]types.UID) []metric.Update {
  990. var scrapeResults []metric.Update
  991. processResource := func(baseLabels map[string]string, name v1.ResourceName, quantity resource.Quantity, metricName string) metric.Update {
  992. resource, unit, value := toResourceUnitValue(name, quantity)
  993. labels := maps.Clone(baseLabels)
  994. labels[source.ResourceLabel] = resource
  995. labels[source.UnitLabel] = unit
  996. return metric.Update{
  997. Name: metricName,
  998. Labels: labels,
  999. Value: value,
  1000. }
  1001. }
  1002. for _, resourceQuota := range resourceQuotas {
  1003. nsUID, _ := namespaceIndex[resourceQuota.Namespace]
  1004. resourceQuotaInfo := map[string]string{
  1005. source.UIDLabel: string(resourceQuota.UID),
  1006. source.NamespaceUIDLabel: string(nsUID),
  1007. source.ResourceQuotaLabel: resourceQuota.Name,
  1008. }
  1009. scrapeResults = append(scrapeResults, metric.Update{
  1010. Name: metric.ResourceQuotaInfo,
  1011. Labels: resourceQuotaInfo,
  1012. AdditionalInfo: resourceQuotaInfo,
  1013. Value: 0,
  1014. })
  1015. if resourceQuota.Spec.Hard != nil {
  1016. // CPU/memory requests can also be aliased as "cpu" and "memory". For now, however, only scrape the complete names
  1017. // https://kubernetes.io/docs/concepts/policy/resource-quotas/#compute-resource-quota
  1018. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsCPU]; ok {
  1019. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1020. }
  1021. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceRequestsMemory]; ok {
  1022. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceRequests))
  1023. }
  1024. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsCPU]; ok {
  1025. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1026. }
  1027. if quantity, ok := resourceQuota.Spec.Hard[v1.ResourceLimitsMemory]; ok {
  1028. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaSpecResourceLimits))
  1029. }
  1030. }
  1031. if resourceQuota.Status.Used != nil {
  1032. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsCPU]; ok {
  1033. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1034. }
  1035. if quantity, ok := resourceQuota.Status.Used[v1.ResourceRequestsMemory]; ok {
  1036. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceRequests))
  1037. }
  1038. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsCPU]; ok {
  1039. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceCPU, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1040. }
  1041. if quantity, ok := resourceQuota.Status.Used[v1.ResourceLimitsMemory]; ok {
  1042. scrapeResults = append(scrapeResults, processResource(resourceQuotaInfo, v1.ResourceMemory, quantity, metric.KubeResourceQuotaStatusUsedResourceLimits))
  1043. }
  1044. }
  1045. }
  1046. events.Dispatch(event.ScrapeEvent{
  1047. ScraperName: event.KubernetesClusterScraperName,
  1048. ScrapeType: event.ResourceQuotaScraperType,
  1049. Targets: len(resourceQuotas),
  1050. Errors: nil,
  1051. })
  1052. return scrapeResults
  1053. }
  1054. // PvcInfo is used to store information about a pvc for tracking volume usage.
  1055. type PvcInfo struct {
  1056. Class string
  1057. Claim string
  1058. Namespace string
  1059. VolumeName string
  1060. Requests float64
  1061. PodsClaimed []string
  1062. }
  1063. func getPvcsInfo(pvcs []*clustercache.PersistentVolumeClaim) map[string]*PvcInfo {
  1064. toReturn := make(map[string]*PvcInfo)
  1065. for _, pvc := range pvcs {
  1066. ns := pvc.Namespace
  1067. pvcName := pvc.Name
  1068. volumeName := pvc.Spec.VolumeName
  1069. pvClass := getPersistentVolumeClaimClass(pvc)
  1070. requests := float64(pvc.Spec.Resources.Requests.Storage().Value())
  1071. key := ns + "," + pvcName
  1072. toReturn[key] = &PvcInfo{
  1073. Class: pvClass,
  1074. Claim: pvcName,
  1075. Namespace: ns,
  1076. VolumeName: volumeName,
  1077. Requests: requests,
  1078. }
  1079. }
  1080. return toReturn
  1081. }
  1082. // NodeGpuInfo contains the gpu count and vgpu counts for nodes
  1083. type NodeGpuInfo struct {
  1084. GPU float64
  1085. VGPU float64
  1086. }
  1087. func (ccs *ClusterCacheScraper) getNodesGpuInfo() map[string]*NodeGpuInfo {
  1088. // use a closure to cache allocatableVGPU result instead of calculating
  1089. // it every time we need it
  1090. var allocatableVGPUs *float64
  1091. allocVGPUs := func() (float64, error) {
  1092. if allocatableVGPUs != nil {
  1093. return *allocatableVGPUs, nil
  1094. }
  1095. vgpu, err := getAllocatableVGPUs(ccs.clusterCache.GetAllDaemonSets())
  1096. if err != nil {
  1097. return vgpu, err
  1098. }
  1099. allocatableVGPUs = &vgpu
  1100. return *allocatableVGPUs, nil
  1101. }
  1102. var nodeGpuMap map[string]*NodeGpuInfo = make(map[string]*NodeGpuInfo)
  1103. for _, node := range ccs.clusterCache.GetAllNodes() {
  1104. info, err := gpuInfoFor(node, allocVGPUs)
  1105. if err != nil {
  1106. log.Warnf("Failed to retrieve GPU Info for Node: %s - %s", node.Name, err)
  1107. continue
  1108. }
  1109. nodeGpuMap[node.Name] = info
  1110. }
  1111. return nodeGpuMap
  1112. }
  1113. // getPersistentVolumeClaimClass returns StorageClassName. If no storage class was
  1114. // requested, it returns "".
  1115. func getPersistentVolumeClaimClass(claim *clustercache.PersistentVolumeClaim) string {
  1116. // Use beta annotation first
  1117. if class, found := claim.Annotations[v1.BetaStorageClassAnnotation]; found {
  1118. return class
  1119. }
  1120. if claim.Spec.StorageClassName != nil {
  1121. return *claim.Spec.StorageClassName
  1122. }
  1123. // Special non-empty string to indicate absence of storage class.
  1124. return ""
  1125. }
  1126. // toResourceUnitValue accepts a resource name and quantity and returns the sanitized resource, the unit, and the value in the units.
  1127. // Returns an empty string for resource and unit if there was a failure.
  1128. func toResourceUnitValue(resourceName v1.ResourceName, quantity resource.Quantity) (resource string, unit string, value float64) {
  1129. resource = promutil.SanitizeLabelName(string(resourceName))
  1130. switch resourceName {
  1131. case v1.ResourceCPU:
  1132. unit = "core"
  1133. value = float64(quantity.MilliValue()) / 1000
  1134. return
  1135. case v1.ResourceStorage:
  1136. fallthrough
  1137. case v1.ResourceEphemeralStorage:
  1138. fallthrough
  1139. case v1.ResourceMemory:
  1140. unit = "byte"
  1141. value = float64(quantity.Value())
  1142. return
  1143. case v1.ResourcePods:
  1144. unit = "integer"
  1145. value = float64(quantity.Value())
  1146. return
  1147. default:
  1148. if isHugePageResourceName(resourceName) || isAttachableVolumeResourceName(resourceName) {
  1149. unit = "byte"
  1150. value = float64(quantity.Value())
  1151. return
  1152. }
  1153. if isExtendedResourceName(resourceName) {
  1154. unit = "integer"
  1155. value = float64(quantity.Value())
  1156. return
  1157. }
  1158. }
  1159. resource = ""
  1160. unit = ""
  1161. value = 0.0
  1162. return
  1163. }
  1164. func isGpuResourceName(name v1.ResourceName) bool {
  1165. return name == "nvidia.com/gpu" || name == "k8s.amazonaws.com/vgpu"
  1166. }
  1167. // isHugePageResourceName checks for a huge page container resource name
  1168. func isHugePageResourceName(name v1.ResourceName) bool {
  1169. return strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix)
  1170. }
  1171. // isAttachableVolumeResourceName checks for attached volume container resource name
  1172. func isAttachableVolumeResourceName(name v1.ResourceName) bool {
  1173. return strings.HasPrefix(string(name), v1.ResourceAttachableVolumesPrefix)
  1174. }
  1175. // isExtendedResourceName checks for extended container resource name
  1176. func isExtendedResourceName(name v1.ResourceName) bool {
  1177. if isNativeResource(name) || strings.HasPrefix(string(name), v1.DefaultResourceRequestsPrefix) {
  1178. return false
  1179. }
  1180. // Ensure it satisfies the rules in IsQualifiedName() after converted into quota resource name
  1181. nameForQuota := fmt.Sprintf("%s%s", v1.DefaultResourceRequestsPrefix, string(name))
  1182. if errs := validation.IsQualifiedName(nameForQuota); len(errs) != 0 {
  1183. return false
  1184. }
  1185. return true
  1186. }
  1187. // isNativeResource checks for a kubernetes.io/ prefixed resource name
  1188. func isNativeResource(name v1.ResourceName) bool {
  1189. return !strings.Contains(string(name), "/") || isPrefixedNativeResource(name)
  1190. }
  1191. func isPrefixedNativeResource(name v1.ResourceName) bool {
  1192. return strings.Contains(string(name), v1.ResourceDefaultNamespacePrefix)
  1193. }
  1194. // gets the Node GPUs and VGPUs using the node data from k8s. Returns nil if GPUs could not be located for the node.
  1195. func gpuInfoFor(
  1196. n *clustercache.Node,
  1197. allocatedVGPUs func() (float64, error),
  1198. ) (*NodeGpuInfo, error) {
  1199. g, hasGpu := n.Status.Capacity["nvidia.com/gpu"]
  1200. _, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
  1201. // Case 1: Standard NVIDIA GPU
  1202. if hasGpu && g.Value() != 0 && !hasReplicas {
  1203. return &NodeGpuInfo{
  1204. GPU: float64(g.Value()),
  1205. VGPU: float64(g.Value()),
  1206. }, nil
  1207. }
  1208. // Case 2: NVIDIA GPU with GPU Feature Discovery (GFD) Pod enabled.
  1209. // Ref: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html#verifying-the-gpu-time-slicing-configuration
  1210. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L44-L45
  1211. // Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L103-L118
  1212. if hasReplicas {
  1213. resultGPU := 0.0
  1214. resultVGPU := 0.0
  1215. if c, ok := n.Labels["nvidia.com/gpu.count"]; ok {
  1216. var err error
  1217. resultGPU, err = strconv.ParseFloat(c, 64)
  1218. if err != nil {
  1219. return nil, fmt.Errorf("could not parse label \"nvidia.com/gpu.count\": %v", err)
  1220. }
  1221. }
  1222. if s, ok := n.Status.Capacity["nvidia.com/gpu.shared"]; ok { // GFD configured `renameByDefault=true`
  1223. resultVGPU = float64(s.Value())
  1224. } else if g, ok := n.Status.Capacity["nvidia.com/gpu"]; ok { // GFD configured `renameByDefault=false`
  1225. resultVGPU = float64(g.Value())
  1226. } else {
  1227. resultVGPU = resultGPU
  1228. }
  1229. return &NodeGpuInfo{
  1230. GPU: resultGPU,
  1231. VGPU: resultVGPU,
  1232. }, nil
  1233. }
  1234. // Case 3: AWS vGPU
  1235. if vgpu, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
  1236. vgpuCount, err := allocatedVGPUs()
  1237. if err != nil {
  1238. return nil, err
  1239. }
  1240. vgpuCoeff := 10.0
  1241. if vgpuCount > 0.0 {
  1242. vgpuCoeff = vgpuCount
  1243. }
  1244. if vgpu.Value() != 0 {
  1245. resultGPU := float64(vgpu.Value()) / vgpuCoeff
  1246. resultVGPU := float64(vgpu.Value())
  1247. return &NodeGpuInfo{
  1248. GPU: resultGPU,
  1249. VGPU: resultVGPU,
  1250. }, nil
  1251. }
  1252. }
  1253. // No GPU found
  1254. return nil, nil
  1255. }
  1256. func getAllocatableVGPUs(daemonsets []*clustercache.DaemonSet) (float64, error) {
  1257. vgpuCount := 0.0
  1258. for _, ds := range daemonsets {
  1259. dsContainerList := &ds.SpecContainers
  1260. for _, ctnr := range *dsContainerList {
  1261. if ctnr.Args != nil {
  1262. for _, arg := range ctnr.Args {
  1263. if strings.Contains(arg, "--vgpu=") {
  1264. vgpus, err := strconv.ParseFloat(arg[strings.IndexByte(arg, '=')+1:], 64)
  1265. if err != nil {
  1266. log.Errorf("failed to parse vgpu allocation string %s: %v", arg, err)
  1267. continue
  1268. }
  1269. vgpuCount = vgpus
  1270. return vgpuCount, nil
  1271. }
  1272. }
  1273. }
  1274. }
  1275. }
  1276. return vgpuCount, nil
  1277. }