metrics.go 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516
  1. package costmodel
  2. import (
  3. "math"
  4. "strconv"
  5. "strings"
  6. "sync"
  7. "time"
  8. "github.com/kubecost/cost-model/pkg/cloud"
  9. "github.com/kubecost/cost-model/pkg/clustercache"
  10. "github.com/kubecost/cost-model/pkg/env"
  11. "github.com/kubecost/cost-model/pkg/errors"
  12. "github.com/kubecost/cost-model/pkg/log"
  13. "github.com/kubecost/cost-model/pkg/prom"
  14. "github.com/kubecost/cost-model/pkg/util"
  15. promclient "github.com/prometheus/client_golang/api"
  16. "github.com/prometheus/client_golang/prometheus"
  17. dto "github.com/prometheus/client_model/go"
  18. v1 "k8s.io/api/core/v1"
  19. "k8s.io/client-go/kubernetes"
  20. "k8s.io/klog"
  21. )
  22. //--------------------------------------------------------------------------
  23. // StatefulsetCollector
  24. //--------------------------------------------------------------------------
  25. // StatefulsetCollector is a prometheus collector that generates StatefulsetMetrics from the stateful sets returned by its KubeClusterCache.
  26. type StatefulsetCollector struct {
  27. KubeClusterCache clustercache.ClusterCache
  28. }
  29. // Describe sends the super-set of all possible descriptors of metrics
  30. // collected by this Collector.
  31. func (sc StatefulsetCollector) Describe(ch chan<- *prometheus.Desc) {
  32. ch <- prometheus.NewDesc("statefulSet_match_labels", "statfulSet match labels", []string{}, nil)
  33. }
  34. // Collect is called by the Prometheus registry when collecting metrics.
  35. func (sc StatefulsetCollector) Collect(ch chan<- prometheus.Metric) {
  36. ds := sc.KubeClusterCache.GetAllStatefulSets()
  37. for _, statefulset := range ds {
  38. labels, values := prom.KubeLabelsToLabels(statefulset.Spec.Selector.MatchLabels)
  39. if len(labels) > 0 {
  40. m := newStatefulsetMetric(statefulset.GetName(), statefulset.GetNamespace(), "statefulSet_match_labels", labels, values)
  41. ch <- m
  42. }
  43. }
  44. }
  45. //--------------------------------------------------------------------------
  46. // StatefulsetMetric
  47. //--------------------------------------------------------------------------
  // StatefulsetMetric is a prometheus.Metric used to encode statefulset match
  // labels.
  type StatefulsetMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// statefulsetName and namespace identify the stateful set being described.
  	statefulsetName string
  	namespace       string
  }
  57. // Creates a new StatefulsetMetric, implementation of prometheus.Metric
  58. func newStatefulsetMetric(name, namespace, fqname string, labelNames []string, labelvalues []string) StatefulsetMetric {
  59. return StatefulsetMetric{
  60. fqName: fqname,
  61. labelNames: labelNames,
  62. labelValues: labelvalues,
  63. help: "statefulSet_match_labels StatefulSet Match Labels",
  64. statefulsetName: name,
  65. namespace: namespace,
  66. }
  67. }
  68. // Desc returns the descriptor for the Metric. This method idempotently
  69. // returns the same descriptor throughout the lifetime of the Metric.
  70. func (s StatefulsetMetric) Desc() *prometheus.Desc {
  71. l := prometheus.Labels{"statefulSet": s.statefulsetName, "namespace": s.namespace}
  72. return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
  73. }
  74. // Write encodes the Metric into a "Metric" Protocol Buffer data
  75. // transmission object.
  76. func (s StatefulsetMetric) Write(m *dto.Metric) error {
  77. h := float64(1)
  78. m.Gauge = &dto.Gauge{
  79. Value: &h,
  80. }
  81. var labels []*dto.LabelPair
  82. for i := range s.labelNames {
  83. labels = append(labels, &dto.LabelPair{
  84. Name: &s.labelNames[i],
  85. Value: &s.labelValues[i],
  86. })
  87. }
  88. n := "namespace"
  89. labels = append(labels, &dto.LabelPair{
  90. Name: &n,
  91. Value: &s.namespace,
  92. })
  93. r := "statefulSet"
  94. labels = append(labels, &dto.LabelPair{
  95. Name: &r,
  96. Value: &s.statefulsetName,
  97. })
  98. m.Label = labels
  99. return nil
  100. }
  101. //--------------------------------------------------------------------------
  102. // DeploymentCollector
  103. //--------------------------------------------------------------------------
  104. // DeploymentCollector is a prometheus collector that generates DeploymentMetrics from the deployments returned by its KubeClusterCache.
  105. type DeploymentCollector struct {
  106. KubeClusterCache clustercache.ClusterCache
  107. }
  108. // Describe sends the super-set of all possible descriptors of metrics
  109. // collected by this Collector.
  110. func (sc DeploymentCollector) Describe(ch chan<- *prometheus.Desc) {
  111. ch <- prometheus.NewDesc("deployment_match_labels", "deployment match labels", []string{}, nil)
  112. }
  113. // Collect is called by the Prometheus registry when collecting metrics.
  114. func (sc DeploymentCollector) Collect(ch chan<- prometheus.Metric) {
  115. ds := sc.KubeClusterCache.GetAllDeployments()
  116. for _, deployment := range ds {
  117. labels, values := prom.KubeLabelsToLabels(deployment.Spec.Selector.MatchLabels)
  118. if len(labels) > 0 {
  119. m := newDeploymentMetric(deployment.GetName(), deployment.GetNamespace(), "deployment_match_labels", labels, values)
  120. ch <- m
  121. }
  122. }
  123. }
  124. //--------------------------------------------------------------------------
  125. // DeploymentMetric
  126. //--------------------------------------------------------------------------
  // DeploymentMetric is a prometheus.Metric used to encode deployment match
  // labels.
  type DeploymentMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// deploymentName and namespace identify the deployment being described.
  	deploymentName string
  	namespace      string
  }
  136. // Creates a new DeploymentMetric, implementation of prometheus.Metric
  137. func newDeploymentMetric(name, namespace, fqname string, labelNames []string, labelvalues []string) DeploymentMetric {
  138. return DeploymentMetric{
  139. fqName: fqname,
  140. labelNames: labelNames,
  141. labelValues: labelvalues,
  142. help: "deployment_match_labels Deployment Match Labels",
  143. deploymentName: name,
  144. namespace: namespace,
  145. }
  146. }
  147. // Desc returns the descriptor for the Metric. This method idempotently
  148. // returns the same descriptor throughout the lifetime of the Metric.
  149. func (s DeploymentMetric) Desc() *prometheus.Desc {
  150. l := prometheus.Labels{"deployment": s.deploymentName, "namespace": s.namespace}
  151. return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
  152. }
  153. // Write encodes the Metric into a "Metric" Protocol Buffer data
  154. // transmission object.
  155. func (s DeploymentMetric) Write(m *dto.Metric) error {
  156. h := float64(1)
  157. m.Gauge = &dto.Gauge{
  158. Value: &h,
  159. }
  160. var labels []*dto.LabelPair
  161. for i := range s.labelNames {
  162. labels = append(labels, &dto.LabelPair{
  163. Name: &s.labelNames[i],
  164. Value: &s.labelValues[i],
  165. })
  166. }
  167. n := "namespace"
  168. labels = append(labels, &dto.LabelPair{
  169. Name: &n,
  170. Value: &s.namespace,
  171. })
  172. r := "deployment"
  173. labels = append(labels, &dto.LabelPair{
  174. Name: &r,
  175. Value: &s.deploymentName,
  176. })
  177. m.Label = labels
  178. return nil
  179. }
  180. //--------------------------------------------------------------------------
  181. // ServiceCollector
  182. //--------------------------------------------------------------------------
  183. // ServiceCollector is a prometheus collector that generates ServiceMetrics from the services returned by its KubeClusterCache.
  184. type ServiceCollector struct {
  185. KubeClusterCache clustercache.ClusterCache
  186. }
  187. // Describe sends the super-set of all possible descriptors of metrics
  188. // collected by this Collector.
  189. func (sc ServiceCollector) Describe(ch chan<- *prometheus.Desc) {
  190. ch <- prometheus.NewDesc("service_selector_labels", "service selector labels", []string{}, nil)
  191. }
  192. // Collect is called by the Prometheus registry when collecting metrics.
  193. func (sc ServiceCollector) Collect(ch chan<- prometheus.Metric) {
  194. svcs := sc.KubeClusterCache.GetAllServices()
  195. for _, svc := range svcs {
  196. labels, values := prom.KubeLabelsToLabels(svc.Spec.Selector)
  197. if len(labels) > 0 {
  198. m := newServiceMetric(svc.GetName(), svc.GetNamespace(), "service_selector_labels", labels, values)
  199. ch <- m
  200. }
  201. }
  202. }
  203. //--------------------------------------------------------------------------
  204. // ServiceMetric
  205. //--------------------------------------------------------------------------
  // ServiceMetric is a prometheus.Metric used to encode service selector
  // labels.
  type ServiceMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// serviceName and namespace identify the service being described.
  	serviceName string
  	namespace   string
  }
  215. // Creates a new ServiceMetric, implementation of prometheus.Metric
  216. func newServiceMetric(name, namespace, fqname string, labelNames []string, labelvalues []string) ServiceMetric {
  217. return ServiceMetric{
  218. fqName: fqname,
  219. labelNames: labelNames,
  220. labelValues: labelvalues,
  221. help: "service_selector_labels Service Selector Labels",
  222. serviceName: name,
  223. namespace: namespace,
  224. }
  225. }
  226. // Desc returns the descriptor for the Metric. This method idempotently
  227. // returns the same descriptor throughout the lifetime of the Metric.
  228. func (s ServiceMetric) Desc() *prometheus.Desc {
  229. l := prometheus.Labels{"service": s.serviceName, "namespace": s.namespace}
  230. return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
  231. }
  232. // Write encodes the Metric into a "Metric" Protocol Buffer data
  233. // transmission object.
  234. func (s ServiceMetric) Write(m *dto.Metric) error {
  235. h := float64(1)
  236. m.Gauge = &dto.Gauge{
  237. Value: &h,
  238. }
  239. var labels []*dto.LabelPair
  240. for i := range s.labelNames {
  241. labels = append(labels, &dto.LabelPair{
  242. Name: &s.labelNames[i],
  243. Value: &s.labelValues[i],
  244. })
  245. }
  246. n := "namespace"
  247. labels = append(labels, &dto.LabelPair{
  248. Name: &n,
  249. Value: &s.namespace,
  250. })
  251. r := "service"
  252. labels = append(labels, &dto.LabelPair{
  253. Name: &r,
  254. Value: &s.serviceName,
  255. })
  256. m.Label = labels
  257. return nil
  258. }
  259. //--------------------------------------------------------------------------
  260. // NamespaceAnnotationCollector
  261. //--------------------------------------------------------------------------
  262. // NamespaceAnnotationCollector is a prometheus collector that generates NamespaceAnnotationsMetrics from the namespaces returned by its KubeClusterCache.
  263. type NamespaceAnnotationCollector struct {
  264. KubeClusterCache clustercache.ClusterCache
  265. }
  266. // Describe sends the super-set of all possible descriptors of metrics
  267. // collected by this Collector.
  268. func (nsac NamespaceAnnotationCollector) Describe(ch chan<- *prometheus.Desc) {
  269. ch <- prometheus.NewDesc("kube_namespace_annotations", "namespace annotations", []string{}, nil)
  270. }
  271. // Collect is called by the Prometheus registry when collecting metrics.
  272. func (nsac NamespaceAnnotationCollector) Collect(ch chan<- prometheus.Metric) {
  273. namespaces := nsac.KubeClusterCache.GetAllNamespaces()
  274. for _, namespace := range namespaces {
  275. labels, values := prom.KubeAnnotationsToLabels(namespace.Annotations)
  276. if len(labels) > 0 {
  277. m := newNamespaceAnnotationsMetric(namespace.GetName(), "kube_namespace_annotations", labels, values)
  278. ch <- m
  279. }
  280. }
  281. }
  282. //--------------------------------------------------------------------------
  283. // NamespaceAnnotationsMetric
  284. //--------------------------------------------------------------------------
  // NamespaceAnnotationsMetric is a prometheus.Metric used to encode namespace
  // annotations.
  type NamespaceAnnotationsMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// namespace is the name of the namespace being described.
  	namespace string
  }
  293. // Creates a new NamespaceAnnotationsMetric, implementation of prometheus.Metric
  294. func newNamespaceAnnotationsMetric(namespace, fqname string, labelNames []string, labelValues []string) NamespaceAnnotationsMetric {
  295. return NamespaceAnnotationsMetric{
  296. namespace: namespace,
  297. fqName: fqname,
  298. labelNames: labelNames,
  299. labelValues: labelValues,
  300. help: "kube_namespace_annotations Namespace Annotations",
  301. }
  302. }
  303. // Desc returns the descriptor for the Metric. This method idempotently
  304. // returns the same descriptor throughout the lifetime of the Metric.
  305. func (nam NamespaceAnnotationsMetric) Desc() *prometheus.Desc {
  306. l := prometheus.Labels{"namespace": nam.namespace}
  307. return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
  308. }
  309. // Write encodes the Metric into a "Metric" Protocol Buffer data
  310. // transmission object.
  311. func (nam NamespaceAnnotationsMetric) Write(m *dto.Metric) error {
  312. h := float64(1)
  313. m.Gauge = &dto.Gauge{
  314. Value: &h,
  315. }
  316. var labels []*dto.LabelPair
  317. for i := range nam.labelNames {
  318. labels = append(labels, &dto.LabelPair{
  319. Name: &nam.labelNames[i],
  320. Value: &nam.labelValues[i],
  321. })
  322. }
  323. n := "namespace"
  324. labels = append(labels, &dto.LabelPair{
  325. Name: &n,
  326. Value: &nam.namespace,
  327. })
  328. m.Label = labels
  329. return nil
  330. }
  331. //--------------------------------------------------------------------------
  332. // PodAnnotationCollector
  333. //--------------------------------------------------------------------------
  334. // PodAnnotationCollector is a prometheus collector that generates PodAnnotationsMetrics from the pods returned by its KubeClusterCache.
  335. type PodAnnotationCollector struct {
  336. KubeClusterCache clustercache.ClusterCache
  337. }
  338. // Describe sends the super-set of all possible descriptors of metrics
  339. // collected by this Collector.
  340. func (pac PodAnnotationCollector) Describe(ch chan<- *prometheus.Desc) {
  341. ch <- prometheus.NewDesc("kube_pod_annotations", "pod annotations", []string{}, nil)
  342. }
  343. // Collect is called by the Prometheus registry when collecting metrics.
  344. func (pac PodAnnotationCollector) Collect(ch chan<- prometheus.Metric) {
  345. pods := pac.KubeClusterCache.GetAllPods()
  346. for _, pod := range pods {
  347. labels, values := prom.KubeAnnotationsToLabels(pod.Annotations)
  348. if len(labels) > 0 {
  349. m := newPodAnnotationMetric(pod.GetNamespace(), pod.GetName(), "kube_pod_annotations", labels, values)
  350. ch <- m
  351. }
  352. }
  353. }
  354. //--------------------------------------------------------------------------
  355. // PodAnnotationsMetric
  356. //--------------------------------------------------------------------------
  // PodAnnotationsMetric is a prometheus.Metric used to encode pod
  // annotations.
  type PodAnnotationsMetric struct {
  	// name is the pod name; fqName and help are passed to prometheus.NewDesc.
  	name, fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// namespace is the namespace of the pod being described.
  	namespace string
  }
  366. // Creates a new PodAnnotationsMetric, implementation of prometheus.Metric
  367. func newPodAnnotationMetric(namespace, name, fqname string, labelNames []string, labelValues []string) PodAnnotationsMetric {
  368. return PodAnnotationsMetric{
  369. namespace: namespace,
  370. name: name,
  371. fqName: fqname,
  372. labelNames: labelNames,
  373. labelValues: labelValues,
  374. help: "kube_pod_annotations Pod Annotations",
  375. }
  376. }
  377. // Desc returns the descriptor for the Metric. This method idempotently
  378. // returns the same descriptor throughout the lifetime of the Metric.
  379. func (pam PodAnnotationsMetric) Desc() *prometheus.Desc {
  380. l := prometheus.Labels{"namespace": pam.namespace, "pod": pam.name}
  381. return prometheus.NewDesc(pam.fqName, pam.help, pam.labelNames, l)
  382. }
  383. // Write encodes the Metric into a "Metric" Protocol Buffer data
  384. // transmission object.
  385. func (pam PodAnnotationsMetric) Write(m *dto.Metric) error {
  386. h := float64(1)
  387. m.Gauge = &dto.Gauge{
  388. Value: &h,
  389. }
  390. var labels []*dto.LabelPair
  391. for i := range pam.labelNames {
  392. labels = append(labels, &dto.LabelPair{
  393. Name: &pam.labelNames[i],
  394. Value: &pam.labelValues[i],
  395. })
  396. }
  397. n := "namespace"
  398. labels = append(labels, &dto.LabelPair{
  399. Name: &n,
  400. Value: &pam.namespace,
  401. })
  402. r := "pod"
  403. labels = append(labels, &dto.LabelPair{
  404. Name: &r,
  405. Value: &pam.name,
  406. })
  407. m.Label = labels
  408. return nil
  409. }
  410. //--------------------------------------------------------------------------
  411. // ClusterInfoCollector
  412. //--------------------------------------------------------------------------
  413. // ClusterInfoCollector is a prometheus collector that generates ClusterInfoMetrics; its Cloud and KubeClientSet fields are handed to GetClusterInfo on every collection.
  414. type ClusterInfoCollector struct {
  415. Cloud cloud.Provider
  416. KubeClientSet kubernetes.Interface
  417. }
  418. // Describe sends the super-set of all possible descriptors of metrics
  419. // collected by this Collector.
  420. func (cic ClusterInfoCollector) Describe(ch chan<- *prometheus.Desc) {
  421. ch <- prometheus.NewDesc("kubecost_cluster_info", "Kubecost Cluster Info", []string{}, nil)
  422. }
  423. // Collect is called by the Prometheus registry when collecting metrics.
  424. func (cic ClusterInfoCollector) Collect(ch chan<- prometheus.Metric) {
  425. clusterInfo := GetClusterInfo(cic.KubeClientSet, cic.Cloud)
  426. labels := prom.MapToLabels(clusterInfo)
  427. m := newClusterInfoMetric("kubecost_cluster_info", labels)
  428. ch <- m
  429. }
  430. //--------------------------------------------------------------------------
  431. // ClusterInfoMetric
  432. //--------------------------------------------------------------------------
  // ClusterInfoMetric is a prometheus.Metric used to encode the local cluster
  // info.
  type ClusterInfoMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labels holds the label name/value pairs written out with the metric.
  	labels map[string]string
  }
  439. // Creates a new ClusterInfoMetric, implementation of prometheus.Metric
  440. func newClusterInfoMetric(fqName string, labels map[string]string) ClusterInfoMetric {
  441. return ClusterInfoMetric{
  442. fqName: fqName,
  443. labels: labels,
  444. help: "kubecost_cluster_info ClusterInfo",
  445. }
  446. }
  447. // Desc returns the descriptor for the Metric. This method idempotently
  448. // returns the same descriptor throughout the lifetime of the Metric.
  449. func (cim ClusterInfoMetric) Desc() *prometheus.Desc {
  450. l := prometheus.Labels{}
  451. return prometheus.NewDesc(cim.fqName, cim.help, prom.LabelNamesFrom(cim.labels), l)
  452. }
  453. // Write encodes the Metric into a "Metric" Protocol Buffer data
  454. // transmission object.
  455. func (cim ClusterInfoMetric) Write(m *dto.Metric) error {
  456. h := float64(1)
  457. m.Gauge = &dto.Gauge{
  458. Value: &h,
  459. }
  460. var labels []*dto.LabelPair
  461. for k, v := range cim.labels {
  462. labels = append(labels, &dto.LabelPair{
  463. Name: toStringPtr(k),
  464. Value: toStringPtr(v),
  465. })
  466. }
  467. m.Label = labels
  468. return nil
  469. }
  470. //--------------------------------------------------------------------------
  471. // KubeNodeStatusCapacityMemoryBytesCollector
  472. //--------------------------------------------------------------------------
  473. // KubeNodeStatusCapacityMemoryBytesCollector is a prometheus collector that generates
  474. // KubeNodeStatusCapacityMemoryBytesMetrics from the nodes returned by its KubeClusterCache.
  475. type KubeNodeStatusCapacityMemoryBytesCollector struct {
  476. KubeClusterCache clustercache.ClusterCache
  477. }
  478. // Describe sends the super-set of all possible descriptors of metrics
  479. // collected by this Collector.
  480. func (nsac KubeNodeStatusCapacityMemoryBytesCollector) Describe(ch chan<- *prometheus.Desc) {
  481. ch <- prometheus.NewDesc("kube_node_status_capacity_memory_bytes", "node capacity memory bytes", []string{}, nil)
  482. }
  483. // Collect is called by the Prometheus registry when collecting metrics.
  484. func (nsac KubeNodeStatusCapacityMemoryBytesCollector) Collect(ch chan<- prometheus.Metric) {
  485. nodes := nsac.KubeClusterCache.GetAllNodes()
  486. for _, node := range nodes {
  487. // k8s.io/apimachinery/pkg/api/resource/amount.go and
  488. // k8s.io/apimachinery/pkg/api/resource/quantity.go for
  489. // details on the "amount" API. See
  490. // https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-types
  491. // for the units of memory and CPU.
  492. memoryBytes := node.Status.Capacity.Memory().Value()
  493. m := newKubeNodeStatusCapacityMemoryBytesMetric(node.GetName(), memoryBytes, "kube_node_status_capacity_memory_bytes", nil, nil)
  494. ch <- m
  495. }
  496. }
  497. //--------------------------------------------------------------------------
  498. // KubeNodeStatusCapacityMemoryBytesMetric
  499. //--------------------------------------------------------------------------
  // KubeNodeStatusCapacityMemoryBytesMetric is a prometheus.Metric used to
  // encode a duplicate of the deprecated kube-state-metrics metric
  // kube_node_status_capacity_memory_bytes.
  type KubeNodeStatusCapacityMemoryBytesMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// bytes is the gauge value reported for the node.
  	bytes int64
  	// node is the name of the node being described.
  	node string
  }
  511. // Creates a new KubeNodeStatusCapacityMemoryBytesMetric, implementation of prometheus.Metric
  512. func newKubeNodeStatusCapacityMemoryBytesMetric(node string, bytes int64, fqname string, labelNames []string, labelValues []string) KubeNodeStatusCapacityMemoryBytesMetric {
  513. return KubeNodeStatusCapacityMemoryBytesMetric{
  514. fqName: fqname,
  515. labelNames: labelNames,
  516. labelValues: labelValues,
  517. help: "kube_node_status_capacity_memory_bytes Node Capacity Memory Bytes",
  518. bytes: bytes,
  519. node: node,
  520. }
  521. }
  522. // Desc returns the descriptor for the Metric. This method idempotently
  523. // returns the same descriptor throughout the lifetime of the Metric.
  524. func (nam KubeNodeStatusCapacityMemoryBytesMetric) Desc() *prometheus.Desc {
  525. l := prometheus.Labels{"node": nam.node}
  526. return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
  527. }
  528. // Write encodes the Metric into a "Metric" Protocol Buffer data
  529. // transmission object.
  530. func (nam KubeNodeStatusCapacityMemoryBytesMetric) Write(m *dto.Metric) error {
  531. h := float64(nam.bytes)
  532. m.Gauge = &dto.Gauge{
  533. Value: &h,
  534. }
  535. var labels []*dto.LabelPair
  536. for i := range nam.labelNames {
  537. labels = append(labels, &dto.LabelPair{
  538. Name: &nam.labelNames[i],
  539. Value: &nam.labelValues[i],
  540. })
  541. }
  542. n := "node"
  543. labels = append(labels, &dto.LabelPair{
  544. Name: &n,
  545. Value: &nam.node,
  546. })
  547. m.Label = labels
  548. return nil
  549. }
  550. //--------------------------------------------------------------------------
  551. // KubeNodeStatusCapacityCPUCoresCollector
  552. //--------------------------------------------------------------------------
  553. // KubeNodeStatusCapacityCPUCoresCollector is a prometheus collector that generates
  554. // KubeNodeStatusCapacityCPUCoresMetrics from the nodes returned by its KubeClusterCache.
  555. type KubeNodeStatusCapacityCPUCoresCollector struct {
  556. KubeClusterCache clustercache.ClusterCache
  557. }
  558. // Describe sends the super-set of all possible descriptors of metrics
  559. // collected by this Collector.
  560. func (nsac KubeNodeStatusCapacityCPUCoresCollector) Describe(ch chan<- *prometheus.Desc) {
  561. ch <- prometheus.NewDesc("kube_node_status_capacity_cpu_cores", "node capacity cpu cores", []string{}, nil)
  562. }
  563. // Collect is called by the Prometheus registry when collecting metrics.
  564. func (nsac KubeNodeStatusCapacityCPUCoresCollector) Collect(ch chan<- prometheus.Metric) {
  565. nodes := nsac.KubeClusterCache.GetAllNodes()
  566. for _, node := range nodes {
  567. // k8s.io/apimachinery/pkg/api/resource/amount.go and
  568. // k8s.io/apimachinery/pkg/api/resource/quantity.go for
  569. // details on the "amount" API. See
  570. // https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-types
  571. // for the units of memory and CPU.
  572. cpuCores := float64(node.Status.Capacity.Cpu().MilliValue()) / 1000
  573. m := newKubeNodeStatusCapacityCPUCoresMetric(node.GetName(), cpuCores, "kube_node_status_capacity_cpu_cores", nil, nil)
  574. ch <- m
  575. }
  576. }
  577. //--------------------------------------------------------------------------
  578. // KubeNodeStatusCapacityCPUCoresMetric
  579. //--------------------------------------------------------------------------
  // KubeNodeStatusCapacityCPUCoresMetric is a prometheus.Metric used to encode
  // a duplicate of the deprecated kube-state-metrics metric
  // kube_node_status_capacity_cpu_cores.
  type KubeNodeStatusCapacityCPUCoresMetric struct {
  	// fqName and help are passed to prometheus.NewDesc when building the descriptor.
  	fqName, help string
  	// labelNames and labelValues are indexed together when the metric is written.
  	labelNames, labelValues []string
  	// cores is the gauge value reported for the node.
  	cores float64
  	// node is the name of the node being described.
  	node string
  }
  591. // Creates a new KubeNodeStatusCapacityCPUCoresMetric, implementation of prometheus.Metric
  592. func newKubeNodeStatusCapacityCPUCoresMetric(node string, cores float64, fqname string, labelNames []string, labelValues []string) KubeNodeStatusCapacityCPUCoresMetric {
  593. return KubeNodeStatusCapacityCPUCoresMetric{
  594. fqName: fqname,
  595. labelNames: labelNames,
  596. labelValues: labelValues,
  597. help: "kube_node_status_capacity_cpu_cores Node Capacity CPU Cores",
  598. cores: cores,
  599. node: node,
  600. }
  601. }
  602. // Desc returns the descriptor for the Metric. This method idempotently
  603. // returns the same descriptor throughout the lifetime of the Metric.
  604. func (nam KubeNodeStatusCapacityCPUCoresMetric) Desc() *prometheus.Desc {
  605. l := prometheus.Labels{"node": nam.node}
  606. return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
  607. }
  608. // Write encodes the Metric into a "Metric" Protocol Buffer data
  609. // transmission object.
  610. func (nam KubeNodeStatusCapacityCPUCoresMetric) Write(m *dto.Metric) error {
  611. h := nam.cores
  612. m.Gauge = &dto.Gauge{
  613. Value: &h,
  614. }
  615. var labels []*dto.LabelPair
  616. for i := range nam.labelNames {
  617. labels = append(labels, &dto.LabelPair{
  618. Name: &nam.labelNames[i],
  619. Value: &nam.labelValues[i],
  620. })
  621. }
  622. n := "node"
  623. labels = append(labels, &dto.LabelPair{
  624. Name: &n,
  625. Value: &nam.node,
  626. })
  627. m.Label = labels
  628. return nil
  629. }
//--------------------------------------------------------------------------
//  KubePodLabelsCollector
//--------------------------------------------------------------------------

// KubePodLabelsCollector is a prometheus collector that generates
// KubePodLabelsMetrics.
//
// We use this to emit kube_pod_labels with all of a pod's labels, regardless
// of the whitelist setting introduced in KSM v2. See
// https://github.com/kubernetes/kube-state-metrics/issues/1270#issuecomment-712986441
type KubePodLabelsCollector struct {
	// KubeClusterCache is the cache the pods are read from on every Collect.
	KubeClusterCache clustercache.ClusterCache
}
  642. // Describe sends the super-set of all possible descriptors of metrics
  643. // collected by this Collector.
  644. func (nsac KubePodLabelsCollector) Describe(ch chan<- *prometheus.Desc) {
  645. ch <- prometheus.NewDesc("kube_pod_labels", "all labels for each pod prefixed with label_", []string{}, nil)
  646. }
  647. // Collect is called by the Prometheus registry when collecting metrics.
  648. func (nsac KubePodLabelsCollector) Collect(ch chan<- prometheus.Metric) {
  649. pods := nsac.KubeClusterCache.GetAllPods()
  650. for _, pod := range pods {
  651. labelNames, labelValues := prom.KubePrependQualifierToLabels(pod.GetLabels(), "label_")
  652. m := newKubePodLabelsMetric(
  653. pod.GetName(),
  654. pod.GetNamespace(),
  655. string(pod.GetUID()),
  656. "kube_pod_labels",
  657. labelNames,
  658. labelValues,
  659. )
  660. ch <- m
  661. }
  662. }
  663. //--------------------------------------------------------------------------
  664. // KubePodLabelsMetric
  665. //--------------------------------------------------------------------------
  666. // KubePodLabelsMetric is a prometheus.Metric used to encode
  667. // a duplicate of the deprecated kube-state-metrics metric
  668. // kube_pod_labels
  669. type KubePodLabelsMetric struct {
  670. fqName string
  671. help string
  672. labelNames []string
  673. labelValues []string
  674. pod string
  675. namespace string
  676. uid string
  677. }
  678. // Creates a new KubePodLabelsMetric, implementation of prometheus.Metric
  679. func newKubePodLabelsMetric(pod string, namespace string, uid string, fqname string, labelNames []string, labelValues []string) KubePodLabelsMetric {
  680. return KubePodLabelsMetric{
  681. fqName: fqname,
  682. labelNames: labelNames,
  683. labelValues: labelValues,
  684. help: "kube_pod_labels all labels for each pod prefixed with label_",
  685. pod: pod,
  686. namespace: namespace,
  687. uid: uid,
  688. }
  689. }
  690. // Desc returns the descriptor for the Metric. This method idempotently
  691. // returns the same descriptor throughout the lifetime of the Metric.
  692. func (nam KubePodLabelsMetric) Desc() *prometheus.Desc {
  693. l := prometheus.Labels{
  694. "pod": nam.pod,
  695. "namespace": nam.namespace,
  696. "uid": nam.uid,
  697. }
  698. return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
  699. }
  700. // Write encodes the Metric into a "Metric" Protocol Buffer data
  701. // transmission object.
  702. func (nam KubePodLabelsMetric) Write(m *dto.Metric) error {
  703. h := float64(1)
  704. m.Gauge = &dto.Gauge{
  705. Value: &h,
  706. }
  707. var labels []*dto.LabelPair
  708. for i := range nam.labelNames {
  709. labels = append(labels, &dto.LabelPair{
  710. Name: &nam.labelNames[i],
  711. Value: &nam.labelValues[i],
  712. })
  713. }
  714. podString := "pod"
  715. namespaceString := "namespace"
  716. uidString := "uid"
  717. labels = append(labels,
  718. &dto.LabelPair{
  719. Name: &podString,
  720. Value: &nam.pod,
  721. },
  722. &dto.LabelPair{
  723. Name: &namespaceString,
  724. Value: &nam.namespace,
  725. }, &dto.LabelPair{
  726. Name: &uidString,
  727. Value: &nam.uid,
  728. },
  729. )
  730. m.Label = labels
  731. return nil
  732. }
//--------------------------------------------------------------------------
//  KubeNodeLabelsCollector
//--------------------------------------------------------------------------

// KubeNodeLabelsCollector is a prometheus collector that generates
// KubeNodeLabelsMetrics.
//
// We use this to emit kube_node_labels with all of a node's labels, regardless
// of the whitelist setting introduced in KSM v2. See
// https://github.com/kubernetes/kube-state-metrics/issues/1270#issuecomment-712986441
type KubeNodeLabelsCollector struct {
	// KubeClusterCache is the cache the nodes are read from on every Collect.
	KubeClusterCache clustercache.ClusterCache
}
  745. // Describe sends the super-set of all possible descriptors of metrics
  746. // collected by this Collector.
  747. func (nsac KubeNodeLabelsCollector) Describe(ch chan<- *prometheus.Desc) {
  748. ch <- prometheus.NewDesc("kube_node_labels", "all labels for each node prefixed with label_", []string{}, nil)
  749. }
  750. // Collect is called by the Prometheus registry when collecting metrics.
  751. func (nsac KubeNodeLabelsCollector) Collect(ch chan<- prometheus.Metric) {
  752. nodes := nsac.KubeClusterCache.GetAllNodes()
  753. for _, node := range nodes {
  754. labelNames, labelValues := prom.KubePrependQualifierToLabels(node.GetLabels(), "label_")
  755. m := newKubeNodeLabelsMetric(
  756. node.GetName(),
  757. "kube_node_labels",
  758. labelNames,
  759. labelValues,
  760. )
  761. ch <- m
  762. }
  763. }
  764. //--------------------------------------------------------------------------
  765. // KubeNodeLabelsMetric
  766. //--------------------------------------------------------------------------
  767. // KubeNodeLabelsMetric is a prometheus.Metric used to encode
  768. // a duplicate of the deprecated kube-state-metrics metric
  769. // kube_node_labels
  770. type KubeNodeLabelsMetric struct {
  771. fqName string
  772. help string
  773. labelNames []string
  774. labelValues []string
  775. node string
  776. }
  777. // Creates a new KubeNodeLabelsMetric, implementation of prometheus.Metric
  778. func newKubeNodeLabelsMetric(node string, fqname string, labelNames []string, labelValues []string) KubeNodeLabelsMetric {
  779. return KubeNodeLabelsMetric{
  780. fqName: fqname,
  781. labelNames: labelNames,
  782. labelValues: labelValues,
  783. help: "kube_node_labels all labels for each node prefixed with label_",
  784. node: node,
  785. }
  786. }
  787. // Desc returns the descriptor for the Metric. This method idempotently
  788. // returns the same descriptor throughout the lifetime of the Metric.
  789. func (nam KubeNodeLabelsMetric) Desc() *prometheus.Desc {
  790. l := prometheus.Labels{
  791. "node": nam.node,
  792. }
  793. return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
  794. }
  795. // Write encodes the Metric into a "Metric" Protocol Buffer data
  796. // transmission object.
  797. func (nam KubeNodeLabelsMetric) Write(m *dto.Metric) error {
  798. h := float64(1)
  799. m.Gauge = &dto.Gauge{
  800. Value: &h,
  801. }
  802. var labels []*dto.LabelPair
  803. for i := range nam.labelNames {
  804. labels = append(labels, &dto.LabelPair{
  805. Name: &nam.labelNames[i],
  806. Value: &nam.labelValues[i],
  807. })
  808. }
  809. nodeString := "node"
  810. labels = append(labels, &dto.LabelPair{Name: &nodeString, Value: &nam.node})
  811. m.Label = labels
  812. return nil
  813. }
  814. // toStringPtr is used to create a new string pointer from iteration vars
  815. func toStringPtr(s string) *string {
  816. return &s
  817. }
//--------------------------------------------------------------------------
//  Cost Model Metrics Initialization
//--------------------------------------------------------------------------

// metricsInit only allows the metrics to be instantiated and registered once.
var metricsInit sync.Once

// Package-level metric handles. They are assigned inside
// initCostModelMetrics; the trailing comment on each is the metric name it is
// created with there.
var (
	cpuGv                      *prometheus.GaugeVec // node_cpu_hourly_cost
	ramGv                      *prometheus.GaugeVec // node_ram_hourly_cost
	gpuGv                      *prometheus.GaugeVec // node_gpu_hourly_cost
	gpuCountGv                 *prometheus.GaugeVec // node_gpu_count
	pvGv                       *prometheus.GaugeVec // pv_hourly_cost
	spotGv                     *prometheus.GaugeVec // kubecost_node_is_spot
	totalGv                    *prometheus.GaugeVec // node_total_hourly_cost
	ramAllocGv                 *prometheus.GaugeVec // container_memory_allocation_bytes
	cpuAllocGv                 *prometheus.GaugeVec // container_cpu_allocation
	gpuAllocGv                 *prometheus.GaugeVec // container_gpu_allocation
	pvAllocGv                  *prometheus.GaugeVec // pod_pvc_allocation
	networkZoneEgressCostG     prometheus.Gauge     // kubecost_network_zone_egress_cost
	networkRegionEgressCostG   prometheus.Gauge     // kubecost_network_region_egress_cost
	networkInternetEgressCostG prometheus.Gauge     // kubecost_network_internet_egress_cost
	clusterManagementCostGv    *prometheus.GaugeVec // kubecost_cluster_management_cost
	lbCostGv                   *prometheus.GaugeVec // kubecost_load_balancer_cost
)
  841. // initCostModelMetrics uses a sync.Once to ensure that these metrics are only created once
  842. func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud.Provider) {
  843. metricsInit.Do(func() {
  844. cpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  845. Name: "node_cpu_hourly_cost",
  846. Help: "node_cpu_hourly_cost hourly cost for each cpu on this node",
  847. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  848. ramGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  849. Name: "node_ram_hourly_cost",
  850. Help: "node_ram_hourly_cost hourly cost for each gb of ram on this node",
  851. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  852. gpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  853. Name: "node_gpu_hourly_cost",
  854. Help: "node_gpu_hourly_cost hourly cost for each gpu on this node",
  855. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  856. gpuCountGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  857. Name: "node_gpu_count",
  858. Help: "node_gpu_count count of gpu on this node",
  859. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  860. pvGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  861. Name: "pv_hourly_cost",
  862. Help: "pv_hourly_cost Cost per GB per hour on a persistent disk",
  863. }, []string{"volumename", "persistentvolume", "provider_id"})
  864. spotGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  865. Name: "kubecost_node_is_spot",
  866. Help: "kubecost_node_is_spot Cloud provider info about node preemptibility",
  867. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  868. totalGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  869. Name: "node_total_hourly_cost",
  870. Help: "node_total_hourly_cost Total node cost per hour",
  871. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  872. ramAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  873. Name: "container_memory_allocation_bytes",
  874. Help: "container_memory_allocation_bytes Bytes of RAM used",
  875. }, []string{"namespace", "pod", "container", "instance", "node"})
  876. cpuAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  877. Name: "container_cpu_allocation",
  878. Help: "container_cpu_allocation Percent of a single CPU used in a minute",
  879. }, []string{"namespace", "pod", "container", "instance", "node"})
  880. gpuAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  881. Name: "container_gpu_allocation",
  882. Help: "container_gpu_allocation GPU used",
  883. }, []string{"namespace", "pod", "container", "instance", "node"})
  884. pvAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  885. Name: "pod_pvc_allocation",
  886. Help: "pod_pvc_allocation Bytes used by a PVC attached to a pod",
  887. }, []string{"namespace", "pod", "persistentvolumeclaim", "persistentvolume"})
  888. networkZoneEgressCostG = prometheus.NewGauge(prometheus.GaugeOpts{
  889. Name: "kubecost_network_zone_egress_cost",
  890. Help: "kubecost_network_zone_egress_cost Total cost per GB egress across zones",
  891. })
  892. networkRegionEgressCostG = prometheus.NewGauge(prometheus.GaugeOpts{
  893. Name: "kubecost_network_region_egress_cost",
  894. Help: "kubecost_network_region_egress_cost Total cost per GB egress across regions",
  895. })
  896. networkInternetEgressCostG = prometheus.NewGauge(prometheus.GaugeOpts{
  897. Name: "kubecost_network_internet_egress_cost",
  898. Help: "kubecost_network_internet_egress_cost Total cost per GB of internet egress.",
  899. })
  900. clusterManagementCostGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  901. Name: "kubecost_cluster_management_cost",
  902. Help: "kubecost_cluster_management_cost Hourly cost paid as a cluster management fee.",
  903. }, []string{"provisioner_name"})
  904. lbCostGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{ // no differentiation between ELB and ALB right now
  905. Name: "kubecost_load_balancer_cost",
  906. Help: "kubecost_load_balancer_cost Hourly cost of load balancer",
  907. }, []string{"ingress_ip", "namespace", "service_name"}) // assumes one ingress IP per load balancer
  908. // Register cost-model metrics for emission
  909. prometheus.MustRegister(cpuGv, ramGv, gpuGv, gpuCountGv, totalGv, pvGv, spotGv)
  910. prometheus.MustRegister(ramAllocGv, cpuAllocGv, gpuAllocGv, pvAllocGv)
  911. prometheus.MustRegister(networkZoneEgressCostG, networkRegionEgressCostG, networkInternetEgressCostG)
  912. prometheus.MustRegister(clusterManagementCostGv, lbCostGv)
  913. // General Metric Collectors
  914. prometheus.MustRegister(ServiceCollector{
  915. KubeClusterCache: clusterCache,
  916. })
  917. prometheus.MustRegister(DeploymentCollector{
  918. KubeClusterCache: clusterCache,
  919. })
  920. prometheus.MustRegister(StatefulsetCollector{
  921. KubeClusterCache: clusterCache,
  922. })
  923. prometheus.MustRegister(ClusterInfoCollector{
  924. KubeClientSet: clusterCache.GetClient(),
  925. Cloud: provider,
  926. })
  927. if env.IsEmitNamespaceAnnotationsMetric() {
  928. prometheus.MustRegister(NamespaceAnnotationCollector{
  929. KubeClusterCache: clusterCache,
  930. })
  931. }
  932. if env.IsEmitPodAnnotationsMetric() {
  933. prometheus.MustRegister(PodAnnotationCollector{
  934. KubeClusterCache: clusterCache,
  935. })
  936. }
  937. if env.IsEmitKsmV1Metrics() {
  938. prometheus.MustRegister(KubeNodeStatusCapacityMemoryBytesCollector{
  939. KubeClusterCache: clusterCache,
  940. })
  941. prometheus.MustRegister(KubeNodeStatusCapacityCPUCoresCollector{
  942. KubeClusterCache: clusterCache,
  943. })
  944. prometheus.MustRegister(KubePodLabelsCollector{
  945. KubeClusterCache: clusterCache,
  946. })
  947. prometheus.MustRegister(KubeNodeLabelsCollector{
  948. KubeClusterCache: clusterCache,
  949. })
  950. }
  951. })
  952. }
//--------------------------------------------------------------------------
//  CostModelMetricsEmitter
//--------------------------------------------------------------------------

// CostModelMetricsEmitter emits all cost-model specific metrics calculated by
// the CostModel.ComputeCostData() method.
type CostModelMetricsEmitter struct {
	// Dependencies used by the emission loop started with Start.
	PrometheusClient promclient.Client
	KubeClusterCache clustercache.ClusterCache
	CloudProvider    cloud.Provider
	Model            *CostModel

	// Metrics
	// NewCostModelMetricsEmitter fills these with the package-level gauges
	// created by initCostModelMetrics.
	CPUPriceRecorder              *prometheus.GaugeVec
	RAMPriceRecorder              *prometheus.GaugeVec
	PersistentVolumePriceRecorder *prometheus.GaugeVec
	GPUPriceRecorder              *prometheus.GaugeVec
	GPUCountRecorder              *prometheus.GaugeVec
	PVAllocationRecorder          *prometheus.GaugeVec
	NodeSpotRecorder              *prometheus.GaugeVec
	NodeTotalPriceRecorder        *prometheus.GaugeVec
	RAMAllocationRecorder         *prometheus.GaugeVec
	CPUAllocationRecorder         *prometheus.GaugeVec
	GPUAllocationRecorder         *prometheus.GaugeVec
	ClusterManagementCostRecorder *prometheus.GaugeVec
	LBCostRecorder                *prometheus.GaugeVec
	NetworkZoneEgressRecorder     prometheus.Gauge
	NetworkRegionEgressRecorder   prometheus.Gauge
	NetworkInternetEgressRecorder prometheus.Gauge

	// Flow Control
	// recordingLock guards recordingStopping and recordingStop.
	recordingLock *sync.Mutex
	// recordingStopping is set by Stop once the stop channel has been closed,
	// and cleared by the emission goroutine when it exits.
	recordingStopping bool
	// recordingStop is non-nil while the emission loop is running; closing it
	// tells the loop to exit.
	recordingStop chan bool
}
  985. // NewCostModelMetricsEmitter creates a new cost-model metrics emitter. Use Start() to begin metric emission.
  986. func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clustercache.ClusterCache, provider cloud.Provider, model *CostModel) *CostModelMetricsEmitter {
  987. // init will only actually execute once to register the custom gauges
  988. initCostModelMetrics(clusterCache, provider)
  989. return &CostModelMetricsEmitter{
  990. PrometheusClient: promClient,
  991. KubeClusterCache: clusterCache,
  992. CloudProvider: provider,
  993. Model: model,
  994. CPUPriceRecorder: cpuGv,
  995. RAMPriceRecorder: ramGv,
  996. GPUPriceRecorder: gpuGv,
  997. GPUCountRecorder: gpuCountGv,
  998. PersistentVolumePriceRecorder: pvGv,
  999. NodeSpotRecorder: spotGv,
  1000. NodeTotalPriceRecorder: totalGv,
  1001. RAMAllocationRecorder: ramAllocGv,
  1002. CPUAllocationRecorder: cpuAllocGv,
  1003. GPUAllocationRecorder: gpuAllocGv,
  1004. PVAllocationRecorder: pvAllocGv,
  1005. NetworkZoneEgressRecorder: networkZoneEgressCostG,
  1006. NetworkRegionEgressRecorder: networkRegionEgressCostG,
  1007. NetworkInternetEgressRecorder: networkInternetEgressCostG,
  1008. ClusterManagementCostRecorder: clusterManagementCostGv,
  1009. LBCostRecorder: lbCostGv,
  1010. recordingLock: new(sync.Mutex),
  1011. recordingStopping: false,
  1012. recordingStop: nil,
  1013. }
  1014. }
  1015. // Checks to see if there is a metric recording stop channel. If it exists, a new
  1016. // channel is not created and false is returned. If it doesn't exist, a new channel
  1017. // is created and true is returned.
  1018. func (cmme *CostModelMetricsEmitter) checkOrCreateRecordingChan() bool {
  1019. cmme.recordingLock.Lock()
  1020. defer cmme.recordingLock.Unlock()
  1021. if cmme.recordingStop != nil {
  1022. return false
  1023. }
  1024. cmme.recordingStop = make(chan bool, 1)
  1025. return true
  1026. }
  1027. // IsRunning returns true if metric recording is running.
  1028. func (cmme *CostModelMetricsEmitter) IsRunning() bool {
  1029. cmme.recordingLock.Lock()
  1030. defer cmme.recordingLock.Unlock()
  1031. return cmme.recordingStop != nil
  1032. }
  1033. // StartCostModelMetricRecording starts the go routine that emits metrics used to determine
  1034. // cluster costs.
  1035. func (cmme *CostModelMetricsEmitter) Start() bool {
  1036. // Check to see if we're already recording
  1037. // This function will create the stop recording channel and return true
  1038. // if it doesn't exist.
  1039. if !cmme.checkOrCreateRecordingChan() {
  1040. log.Errorf("Attempted to start cost model metric recording when it's already running.")
  1041. return false
  1042. }
  1043. go func() {
  1044. defer errors.HandlePanic()
  1045. containerSeen := make(map[string]bool)
  1046. nodeSeen := make(map[string]bool)
  1047. loadBalancerSeen := make(map[string]bool)
  1048. pvSeen := make(map[string]bool)
  1049. pvcSeen := make(map[string]bool)
  1050. getKeyFromLabelStrings := func(labels ...string) string {
  1051. return strings.Join(labels, ",")
  1052. }
  1053. getLabelStringsFromKey := func(key string) []string {
  1054. return strings.Split(key, ",")
  1055. }
  1056. var defaultRegion string = ""
  1057. nodeList := cmme.KubeClusterCache.GetAllNodes()
  1058. if len(nodeList) > 0 {
  1059. var ok bool
  1060. defaultRegion, ok = util.GetRegion(nodeList[0].Labels)
  1061. if !ok {
  1062. log.DedupedWarningf(5, "Failed to locate default region")
  1063. }
  1064. }
  1065. for {
  1066. klog.V(4).Info("Recording prices...")
  1067. podlist := cmme.KubeClusterCache.GetAllPods()
  1068. podStatus := make(map[string]v1.PodPhase)
  1069. for _, pod := range podlist {
  1070. podStatus[pod.Name] = pod.Status.Phase
  1071. }
  1072. cfg, _ := cmme.CloudProvider.GetConfig()
  1073. provisioner, clusterManagementCost, err := cmme.CloudProvider.ClusterManagementPricing()
  1074. if err != nil {
  1075. klog.V(1).Infof("Error getting cluster management cost %s", err.Error())
  1076. }
  1077. cmme.ClusterManagementCostRecorder.WithLabelValues(provisioner).Set(clusterManagementCost)
  1078. // Record network pricing at global scope
  1079. networkCosts, err := cmme.CloudProvider.NetworkPricing()
  1080. if err != nil {
  1081. klog.V(4).Infof("Failed to retrieve network costs: %s", err.Error())
  1082. } else {
  1083. cmme.NetworkZoneEgressRecorder.Set(networkCosts.ZoneNetworkEgressCost)
  1084. cmme.NetworkRegionEgressRecorder.Set(networkCosts.RegionNetworkEgressCost)
  1085. cmme.NetworkInternetEgressRecorder.Set(networkCosts.InternetNetworkEgressCost)
  1086. }
  1087. // TODO: Pass PrometheusClient and CloudProvider into CostModel on instantiation so this isn't so awkward
  1088. data, err := cmme.Model.ComputeCostData(cmme.PrometheusClient, cmme.CloudProvider, "2m", "", "")
  1089. if err != nil {
  1090. // For an error collection, we'll just log the length of the errors (ComputeCostData already logs the
  1091. // actual errors)
  1092. if prom.IsErrorCollection(err) {
  1093. if ec, ok := err.(prom.QueryErrorCollection); ok {
  1094. log.Errorf("Error in price recording: %d errors occurred", len(ec.Errors()))
  1095. }
  1096. } else {
  1097. log.Errorf("Error in price recording: " + err.Error())
  1098. }
  1099. // zero the for loop so the time.Sleep will still work
  1100. data = map[string]*CostData{}
  1101. }
  1102. // TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
  1103. nodes, err := cmme.Model.GetNodeCost(cmme.CloudProvider)
  1104. for nodeName, node := range nodes {
  1105. // Emit costs, guarding against NaN inputs for custom pricing.
  1106. cpuCost, _ := strconv.ParseFloat(node.VCPUCost, 64)
  1107. if math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) {
  1108. cpuCost, _ = strconv.ParseFloat(cfg.CPU, 64)
  1109. if math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) {
  1110. cpuCost = 0
  1111. }
  1112. }
  1113. cpu, _ := strconv.ParseFloat(node.VCPU, 64)
  1114. if math.IsNaN(cpu) || math.IsInf(cpu, 0) {
  1115. cpu = 1 // Assume 1 CPU
  1116. }
  1117. ramCost, _ := strconv.ParseFloat(node.RAMCost, 64)
  1118. if math.IsNaN(ramCost) || math.IsInf(ramCost, 0) {
  1119. ramCost, _ = strconv.ParseFloat(cfg.RAM, 64)
  1120. if math.IsNaN(ramCost) || math.IsInf(ramCost, 0) {
  1121. ramCost = 0
  1122. }
  1123. }
  1124. ram, _ := strconv.ParseFloat(node.RAMBytes, 64)
  1125. if math.IsNaN(ram) || math.IsInf(ram, 0) {
  1126. ram = 0
  1127. }
  1128. gpu, _ := strconv.ParseFloat(node.GPU, 64)
  1129. if math.IsNaN(gpu) || math.IsInf(gpu, 0) {
  1130. gpu = 0
  1131. }
  1132. gpuCost, _ := strconv.ParseFloat(node.GPUCost, 64)
  1133. if math.IsNaN(gpuCost) || math.IsInf(gpuCost, 0) {
  1134. gpuCost, _ = strconv.ParseFloat(cfg.GPU, 64)
  1135. if math.IsNaN(gpuCost) || math.IsInf(gpuCost, 0) {
  1136. gpuCost = 0
  1137. }
  1138. }
  1139. nodeType := node.InstanceType
  1140. nodeRegion := node.Region
  1141. totalCost := cpu*cpuCost + ramCost*(ram/1024/1024/1024) + gpu*gpuCost
  1142. cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
  1143. cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
  1144. cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
  1145. cmme.GPUCountRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpu)
  1146. cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
  1147. if node.IsSpot() {
  1148. cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(1.0)
  1149. } else {
  1150. cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(0.0)
  1151. }
  1152. labelKey := getKeyFromLabelStrings(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID)
  1153. nodeSeen[labelKey] = true
  1154. }
  1155. // TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
  1156. loadBalancers, err := cmme.Model.GetLBCost(cmme.CloudProvider)
  1157. for lbKey, lb := range loadBalancers {
  1158. // TODO: parse (if necessary) and calculate cost associated with loadBalancer based on dynamic cloud prices fetched into each lb struct on GetLBCost() call
  1159. keyParts := getLabelStringsFromKey(lbKey)
  1160. namespace := keyParts[0]
  1161. serviceName := keyParts[1]
  1162. ingressIP := ""
  1163. if len(lb.IngressIPAddresses) > 0 {
  1164. ingressIP = lb.IngressIPAddresses[0] // assumes one ingress IP per load balancer
  1165. }
  1166. cmme.LBCostRecorder.WithLabelValues(ingressIP, namespace, serviceName).Set(lb.Cost)
  1167. labelKey := getKeyFromLabelStrings(namespace, serviceName)
  1168. loadBalancerSeen[labelKey] = true
  1169. }
  1170. for _, costs := range data {
  1171. nodeName := costs.NodeName
  1172. namespace := costs.Namespace
  1173. podName := costs.PodName
  1174. containerName := costs.Name
  1175. if costs.PVCData != nil {
  1176. for _, pvc := range costs.PVCData {
  1177. if pvc.Volume != nil {
  1178. timesClaimed := pvc.TimesClaimed
  1179. if timesClaimed == 0 {
  1180. timesClaimed = 1 // unallocated PVs are unclaimed but have a full allocation
  1181. }
  1182. cmme.PVAllocationRecorder.WithLabelValues(namespace, podName, pvc.Claim, pvc.VolumeName).Set(pvc.Values[0].Value / float64(timesClaimed))
  1183. labelKey := getKeyFromLabelStrings(namespace, podName, pvc.Claim, pvc.VolumeName)
  1184. pvcSeen[labelKey] = true
  1185. }
  1186. }
  1187. }
  1188. if len(costs.RAMAllocation) > 0 {
  1189. cmme.RAMAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.RAMAllocation[0].Value)
  1190. }
  1191. if len(costs.CPUAllocation) > 0 {
  1192. cmme.CPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.CPUAllocation[0].Value)
  1193. }
  1194. if len(costs.GPUReq) > 0 {
  1195. // allocation here is set to the request because shared GPU usage not yet supported.
  1196. cmme.GPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.GPUReq[0].Value)
  1197. }
  1198. labelKey := getKeyFromLabelStrings(namespace, podName, containerName, nodeName, nodeName)
  1199. if podStatus[podName] == v1.PodRunning { // Only report data for current pods
  1200. containerSeen[labelKey] = true
  1201. } else {
  1202. containerSeen[labelKey] = false
  1203. }
  1204. }
  1205. storageClasses := cmme.KubeClusterCache.GetAllStorageClasses()
  1206. storageClassMap := make(map[string]map[string]string)
  1207. for _, storageClass := range storageClasses {
  1208. params := storageClass.Parameters
  1209. storageClassMap[storageClass.ObjectMeta.Name] = params
  1210. if storageClass.GetAnnotations()["storageclass.kubernetes.io/is-default-class"] == "true" || storageClass.GetAnnotations()["storageclass.beta.kubernetes.io/is-default-class"] == "true" {
  1211. storageClassMap["default"] = params
  1212. storageClassMap[""] = params
  1213. }
  1214. }
  1215. pvs := cmme.KubeClusterCache.GetAllPersistentVolumes()
  1216. for _, pv := range pvs {
  1217. parameters, ok := storageClassMap[pv.Spec.StorageClassName]
  1218. if !ok {
  1219. klog.V(4).Infof("Unable to find parameters for storage class \"%s\". Does pv \"%s\" have a storageClassName?", pv.Spec.StorageClassName, pv.Name)
  1220. }
  1221. var region string
  1222. if r, ok := util.GetRegion(pv.Labels); ok {
  1223. region = r
  1224. } else {
  1225. region = defaultRegion
  1226. }
  1227. cacPv := &cloud.PV{
  1228. Class: pv.Spec.StorageClassName,
  1229. Region: region,
  1230. Parameters: parameters,
  1231. }
  1232. // TODO: GetPVCost should be a method in CostModel?
  1233. GetPVCost(cacPv, pv, cmme.CloudProvider, region)
  1234. c, _ := strconv.ParseFloat(cacPv.Cost, 64)
  1235. cmme.PersistentVolumePriceRecorder.WithLabelValues(pv.Name, pv.Name, cacPv.ProviderID).Set(c)
  1236. labelKey := getKeyFromLabelStrings(pv.Name, pv.Name)
  1237. pvSeen[labelKey] = true
  1238. }
  1239. for labelString, seen := range nodeSeen {
  1240. if !seen {
  1241. klog.V(4).Infof("Removing %s from nodes", labelString)
  1242. labels := getLabelStringsFromKey(labelString)
  1243. ok := cmme.NodeTotalPriceRecorder.DeleteLabelValues(labels...)
  1244. if ok {
  1245. klog.V(4).Infof("removed %s from totalprice", labelString)
  1246. } else {
  1247. klog.Infof("FAILURE TO REMOVE %s from totalprice", labelString)
  1248. }
  1249. ok = cmme.NodeSpotRecorder.DeleteLabelValues(labels...)
  1250. if ok {
  1251. klog.V(4).Infof("removed %s from spot records", labelString)
  1252. } else {
  1253. klog.Infof("FAILURE TO REMOVE %s from spot records", labelString)
  1254. }
  1255. ok = cmme.CPUPriceRecorder.DeleteLabelValues(labels...)
  1256. if ok {
  1257. klog.V(4).Infof("removed %s from cpuprice", labelString)
  1258. } else {
  1259. klog.Infof("FAILURE TO REMOVE %s from cpuprice", labelString)
  1260. }
  1261. ok = cmme.GPUPriceRecorder.DeleteLabelValues(labels...)
  1262. if ok {
  1263. klog.V(4).Infof("removed %s from gpuprice", labelString)
  1264. } else {
  1265. klog.Infof("FAILURE TO REMOVE %s from gpuprice", labelString)
  1266. }
  1267. ok = cmme.GPUCountRecorder.DeleteLabelValues(labels...)
  1268. if ok {
  1269. klog.V(4).Infof("removed %s from gpucount", labelString)
  1270. } else {
  1271. klog.Infof("FAILURE TO REMOVE %s from gpucount", labelString)
  1272. }
  1273. ok = cmme.RAMPriceRecorder.DeleteLabelValues(labels...)
  1274. if ok {
  1275. klog.V(4).Infof("removed %s from ramprice", labelString)
  1276. } else {
  1277. klog.Infof("FAILURE TO REMOVE %s from ramprice", labelString)
  1278. }
  1279. delete(nodeSeen, labelString)
  1280. } else {
  1281. nodeSeen[labelString] = false
  1282. }
  1283. }
  1284. for labelString, seen := range loadBalancerSeen {
  1285. if !seen {
  1286. labels := getLabelStringsFromKey(labelString)
  1287. cmme.LBCostRecorder.DeleteLabelValues(labels...)
  1288. } else {
  1289. loadBalancerSeen[labelString] = false
  1290. }
  1291. }
  1292. for labelString, seen := range containerSeen {
  1293. if !seen {
  1294. labels := getLabelStringsFromKey(labelString)
  1295. cmme.RAMAllocationRecorder.DeleteLabelValues(labels...)
  1296. cmme.CPUAllocationRecorder.DeleteLabelValues(labels...)
  1297. cmme.GPUAllocationRecorder.DeleteLabelValues(labels...)
  1298. delete(containerSeen, labelString)
  1299. } else {
  1300. containerSeen[labelString] = false
  1301. }
  1302. }
  1303. for labelString, seen := range pvSeen {
  1304. if !seen {
  1305. labels := getLabelStringsFromKey(labelString)
  1306. cmme.PersistentVolumePriceRecorder.DeleteLabelValues(labels...)
  1307. delete(pvSeen, labelString)
  1308. } else {
  1309. pvSeen[labelString] = false
  1310. }
  1311. }
  1312. for labelString, seen := range pvcSeen {
  1313. if !seen {
  1314. labels := getLabelStringsFromKey(labelString)
  1315. cmme.PVAllocationRecorder.DeleteLabelValues(labels...)
  1316. delete(pvcSeen, labelString)
  1317. } else {
  1318. pvcSeen[labelString] = false
  1319. }
  1320. }
  1321. select {
  1322. case <-time.After(time.Minute):
  1323. case <-cmme.recordingStop:
  1324. cmme.recordingLock.Lock()
  1325. cmme.recordingStopping = false
  1326. cmme.recordingStop = nil
  1327. cmme.recordingLock.Unlock()
  1328. return
  1329. }
  1330. }
  1331. }()
  1332. return true
  1333. }
  1334. // Stop halts the metrics emission loop after the current emission is completed
  1335. // or if the emission is paused.
  1336. func (cmme *CostModelMetricsEmitter) Stop() {
  1337. cmme.recordingLock.Lock()
  1338. defer cmme.recordingLock.Unlock()
  1339. if !cmme.recordingStopping && cmme.recordingStop != nil {
  1340. cmme.recordingStopping = true
  1341. close(cmme.recordingStop)
  1342. }
  1343. }