metrics.go 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064
  1. package costmodel
  2. import (
  3. "math"
  4. "strconv"
  5. "strings"
  6. "sync"
  7. "time"
  8. "github.com/kubecost/cost-model/pkg/cloud"
  9. "github.com/kubecost/cost-model/pkg/clustercache"
  10. "github.com/kubecost/cost-model/pkg/errors"
  11. "github.com/kubecost/cost-model/pkg/log"
  12. "github.com/kubecost/cost-model/pkg/prom"
  13. promclient "github.com/prometheus/client_golang/api"
  14. "github.com/prometheus/client_golang/prometheus"
  15. dto "github.com/prometheus/client_model/go"
  16. v1 "k8s.io/api/core/v1"
  17. "k8s.io/client-go/kubernetes"
  18. "k8s.io/klog"
  19. )
  20. //--------------------------------------------------------------------------
  21. // StatefulsetCollector
  22. //--------------------------------------------------------------------------
  23. // StatefulsetCollector is a prometheus collector that generates StatefulsetMetrics
  24. type StatefulsetCollector struct {
  25. KubeClusterCache clustercache.ClusterCache
  26. }
  27. // Describe sends the super-set of all possible descriptors of metrics
  28. // collected by this Collector.
  29. func (sc StatefulsetCollector) Describe(ch chan<- *prometheus.Desc) {
  30. ch <- prometheus.NewDesc("statefulSet_match_labels", "statfulSet match labels", []string{}, nil)
  31. }
  32. // Collect is called by the Prometheus registry when collecting metrics.
  33. func (sc StatefulsetCollector) Collect(ch chan<- prometheus.Metric) {
  34. ds := sc.KubeClusterCache.GetAllStatefulSets()
  35. for _, statefulset := range ds {
  36. labels, values := prom.KubeLabelsToLabels(statefulset.Spec.Selector.MatchLabels)
  37. m := newStatefulsetMetric(statefulset.GetName(), statefulset.GetNamespace(), "statefulSet_match_labels", labels, values)
  38. ch <- m
  39. }
  40. }
  41. //--------------------------------------------------------------------------
  42. // StatefulsetMetric
  43. //--------------------------------------------------------------------------
  44. // StatefulsetMetric is a prometheus.Metric used to encode statefulset match labels
  45. type StatefulsetMetric struct {
  46. fqName string
  47. help string
  48. labelNames []string
  49. labelValues []string
  50. statefulsetName string
  51. namespace string
  52. }
  53. // Creates a new StatefulsetMetric, implementation of prometheus.Metric
  54. func newStatefulsetMetric(name, namespace, fqname string, labelNames []string, labelvalues []string) StatefulsetMetric {
  55. return StatefulsetMetric{
  56. fqName: fqname,
  57. labelNames: labelNames,
  58. labelValues: labelvalues,
  59. help: "statefulSet_match_labels StatefulSet Match Labels",
  60. statefulsetName: name,
  61. namespace: namespace,
  62. }
  63. }
  64. // Desc returns the descriptor for the Metric. This method idempotently
  65. // returns the same descriptor throughout the lifetime of the Metric.
  66. func (s StatefulsetMetric) Desc() *prometheus.Desc {
  67. l := prometheus.Labels{"statefulSet": s.statefulsetName, "namespace": s.namespace}
  68. return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
  69. }
  70. // Write encodes the Metric into a "Metric" Protocol Buffer data
  71. // transmission object.
  72. func (s StatefulsetMetric) Write(m *dto.Metric) error {
  73. h := float64(1)
  74. m.Gauge = &dto.Gauge{
  75. Value: &h,
  76. }
  77. var labels []*dto.LabelPair
  78. for i := range s.labelNames {
  79. labels = append(labels, &dto.LabelPair{
  80. Name: &s.labelNames[i],
  81. Value: &s.labelValues[i],
  82. })
  83. }
  84. n := "namespace"
  85. labels = append(labels, &dto.LabelPair{
  86. Name: &n,
  87. Value: &s.namespace,
  88. })
  89. r := "statefulSet"
  90. labels = append(labels, &dto.LabelPair{
  91. Name: &r,
  92. Value: &s.statefulsetName,
  93. })
  94. m.Label = labels
  95. return nil
  96. }
  97. //--------------------------------------------------------------------------
  98. // DeploymentCollector
  99. //--------------------------------------------------------------------------
  100. // DeploymentCollector is a prometheus collector that generates DeploymentMetrics
  101. type DeploymentCollector struct {
  102. KubeClusterCache clustercache.ClusterCache
  103. }
  104. // Describe sends the super-set of all possible descriptors of metrics
  105. // collected by this Collector.
  106. func (sc DeploymentCollector) Describe(ch chan<- *prometheus.Desc) {
  107. ch <- prometheus.NewDesc("deployment_match_labels", "deployment match labels", []string{}, nil)
  108. }
  109. // Collect is called by the Prometheus registry when collecting metrics.
  110. func (sc DeploymentCollector) Collect(ch chan<- prometheus.Metric) {
  111. ds := sc.KubeClusterCache.GetAllDeployments()
  112. for _, deployment := range ds {
  113. labels, values := prom.KubeLabelsToLabels(deployment.Spec.Selector.MatchLabels)
  114. m := newDeploymentMetric(deployment.GetName(), deployment.GetNamespace(), "deployment_match_labels", labels, values)
  115. ch <- m
  116. }
  117. }
  118. //--------------------------------------------------------------------------
  119. // DeploymentMetric
  120. //--------------------------------------------------------------------------
  121. // DeploymentMetric is a prometheus.Metric used to encode deployment match labels
  122. type DeploymentMetric struct {
  123. fqName string
  124. help string
  125. labelNames []string
  126. labelValues []string
  127. deploymentName string
  128. namespace string
  129. }
  130. // Creates a new DeploymentMetric, implementation of prometheus.Metric
  131. func newDeploymentMetric(name, namespace, fqname string, labelNames []string, labelvalues []string) DeploymentMetric {
  132. return DeploymentMetric{
  133. fqName: fqname,
  134. labelNames: labelNames,
  135. labelValues: labelvalues,
  136. help: "deployment_match_labels Deployment Match Labels",
  137. deploymentName: name,
  138. namespace: namespace,
  139. }
  140. }
  141. // Desc returns the descriptor for the Metric. This method idempotently
  142. // returns the same descriptor throughout the lifetime of the Metric.
  143. func (s DeploymentMetric) Desc() *prometheus.Desc {
  144. l := prometheus.Labels{"deployment": s.deploymentName, "namespace": s.namespace}
  145. return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
  146. }
  147. // Write encodes the Metric into a "Metric" Protocol Buffer data
  148. // transmission object.
  149. func (s DeploymentMetric) Write(m *dto.Metric) error {
  150. h := float64(1)
  151. m.Gauge = &dto.Gauge{
  152. Value: &h,
  153. }
  154. var labels []*dto.LabelPair
  155. for i := range s.labelNames {
  156. labels = append(labels, &dto.LabelPair{
  157. Name: &s.labelNames[i],
  158. Value: &s.labelValues[i],
  159. })
  160. }
  161. n := "namespace"
  162. labels = append(labels, &dto.LabelPair{
  163. Name: &n,
  164. Value: &s.namespace,
  165. })
  166. r := "deployment"
  167. labels = append(labels, &dto.LabelPair{
  168. Name: &r,
  169. Value: &s.deploymentName,
  170. })
  171. m.Label = labels
  172. return nil
  173. }
  174. //--------------------------------------------------------------------------
  175. // ServiceCollector
  176. //--------------------------------------------------------------------------
  177. // ServiceCollector is a prometheus collector that generates ServiceMetrics
  178. type ServiceCollector struct {
  179. KubeClusterCache clustercache.ClusterCache
  180. }
  181. // Describe sends the super-set of all possible descriptors of metrics
  182. // collected by this Collector.
  183. func (sc ServiceCollector) Describe(ch chan<- *prometheus.Desc) {
  184. ch <- prometheus.NewDesc("service_selector_labels", "service selector labels", []string{}, nil)
  185. }
  186. // Collect is called by the Prometheus registry when collecting metrics.
  187. func (sc ServiceCollector) Collect(ch chan<- prometheus.Metric) {
  188. svcs := sc.KubeClusterCache.GetAllServices()
  189. for _, svc := range svcs {
  190. labels, values := prom.KubeLabelsToLabels(svc.Spec.Selector)
  191. m := newServiceMetric(svc.GetName(), svc.GetNamespace(), "service_selector_labels", labels, values)
  192. ch <- m
  193. }
  194. }
  195. //--------------------------------------------------------------------------
  196. // ServiceMetric
  197. //--------------------------------------------------------------------------
  198. // ServiceMetric is a prometheus.Metric used to encode service selector labels
  199. type ServiceMetric struct {
  200. fqName string
  201. help string
  202. labelNames []string
  203. labelValues []string
  204. serviceName string
  205. namespace string
  206. }
  207. // Creates a new ServiceMetric, implementation of prometheus.Metric
  208. func newServiceMetric(name, namespace, fqname string, labelNames []string, labelvalues []string) ServiceMetric {
  209. return ServiceMetric{
  210. fqName: fqname,
  211. labelNames: labelNames,
  212. labelValues: labelvalues,
  213. help: "service_selector_labels Service Selector Labels",
  214. serviceName: name,
  215. namespace: namespace,
  216. }
  217. }
  218. // Desc returns the descriptor for the Metric. This method idempotently
  219. // returns the same descriptor throughout the lifetime of the Metric.
  220. func (s ServiceMetric) Desc() *prometheus.Desc {
  221. l := prometheus.Labels{"service": s.serviceName, "namespace": s.namespace}
  222. return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
  223. }
  224. // Write encodes the Metric into a "Metric" Protocol Buffer data
  225. // transmission object.
  226. func (s ServiceMetric) Write(m *dto.Metric) error {
  227. h := float64(1)
  228. m.Gauge = &dto.Gauge{
  229. Value: &h,
  230. }
  231. var labels []*dto.LabelPair
  232. for i := range s.labelNames {
  233. labels = append(labels, &dto.LabelPair{
  234. Name: &s.labelNames[i],
  235. Value: &s.labelValues[i],
  236. })
  237. }
  238. n := "namespace"
  239. labels = append(labels, &dto.LabelPair{
  240. Name: &n,
  241. Value: &s.namespace,
  242. })
  243. r := "service"
  244. labels = append(labels, &dto.LabelPair{
  245. Name: &r,
  246. Value: &s.serviceName,
  247. })
  248. m.Label = labels
  249. return nil
  250. }
  251. //--------------------------------------------------------------------------
  252. // NamespaceAnnotationCollector
  253. //--------------------------------------------------------------------------
  254. // NamespaceAnnotationCollector is a prometheus collector that generates NamespaceAnnotationMetrics
  255. type NamespaceAnnotationCollector struct {
  256. KubeClusterCache clustercache.ClusterCache
  257. }
  258. // Describe sends the super-set of all possible descriptors of metrics
  259. // collected by this Collector.
  260. func (nsac NamespaceAnnotationCollector) Describe(ch chan<- *prometheus.Desc) {
  261. ch <- prometheus.NewDesc("kube_namespace_annotations", "namespace annotations", []string{}, nil)
  262. }
  263. // Collect is called by the Prometheus registry when collecting metrics.
  264. func (nsac NamespaceAnnotationCollector) Collect(ch chan<- prometheus.Metric) {
  265. namespaces := nsac.KubeClusterCache.GetAllNamespaces()
  266. for _, namespace := range namespaces {
  267. labels, values := prom.KubeAnnotationsToLabels(namespace.Annotations)
  268. m := newNamespaceAnnotationsMetric(namespace.GetName(), "kube_namespace_annotations", labels, values)
  269. ch <- m
  270. }
  271. }
  272. //--------------------------------------------------------------------------
  273. // NamespaceAnnotationsMetric
  274. //--------------------------------------------------------------------------
  275. // NamespaceAnnotationsMetric is a prometheus.Metric used to encode namespace annotations
  276. type NamespaceAnnotationsMetric struct {
  277. fqName string
  278. help string
  279. labelNames []string
  280. labelValues []string
  281. namespace string
  282. }
  283. // Creates a new NamespaceAnnotationsMetric, implementation of prometheus.Metric
  284. func newNamespaceAnnotationsMetric(namespace, fqname string, labelNames []string, labelValues []string) NamespaceAnnotationsMetric {
  285. return NamespaceAnnotationsMetric{
  286. namespace: namespace,
  287. fqName: fqname,
  288. labelNames: labelNames,
  289. labelValues: labelValues,
  290. help: "kube_namespace_annotations Namespace Annotations",
  291. }
  292. }
  293. // Desc returns the descriptor for the Metric. This method idempotently
  294. // returns the same descriptor throughout the lifetime of the Metric.
  295. func (nam NamespaceAnnotationsMetric) Desc() *prometheus.Desc {
  296. l := prometheus.Labels{"namespace": nam.namespace}
  297. return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
  298. }
  299. // Write encodes the Metric into a "Metric" Protocol Buffer data
  300. // transmission object.
  301. func (nam NamespaceAnnotationsMetric) Write(m *dto.Metric) error {
  302. h := float64(1)
  303. m.Gauge = &dto.Gauge{
  304. Value: &h,
  305. }
  306. var labels []*dto.LabelPair
  307. for i := range nam.labelNames {
  308. labels = append(labels, &dto.LabelPair{
  309. Name: &nam.labelNames[i],
  310. Value: &nam.labelValues[i],
  311. })
  312. }
  313. n := "namespace"
  314. labels = append(labels, &dto.LabelPair{
  315. Name: &n,
  316. Value: &nam.namespace,
  317. })
  318. m.Label = labels
  319. return nil
  320. }
  321. //--------------------------------------------------------------------------
  322. // PodAnnotationCollector
  323. //--------------------------------------------------------------------------
  324. // PodAnnotationCollector is a prometheus collector that generates PodAnnotationMetrics
  325. type PodAnnotationCollector struct {
  326. KubeClusterCache clustercache.ClusterCache
  327. }
  328. // Describe sends the super-set of all possible descriptors of metrics
  329. // collected by this Collector.
  330. func (pac PodAnnotationCollector) Describe(ch chan<- *prometheus.Desc) {
  331. ch <- prometheus.NewDesc("kube_pod_annotations", "pod annotations", []string{}, nil)
  332. }
  333. // Collect is called by the Prometheus registry when collecting metrics.
  334. func (pac PodAnnotationCollector) Collect(ch chan<- prometheus.Metric) {
  335. pods := pac.KubeClusterCache.GetAllPods()
  336. for _, pod := range pods {
  337. labels, values := prom.KubeAnnotationsToLabels(pod.Annotations)
  338. m := newPodAnnotationMetric(pod.GetNamespace(), pod.GetName(), "kube_pod_annotations", labels, values)
  339. ch <- m
  340. }
  341. }
  342. //--------------------------------------------------------------------------
  343. // PodAnnotationsMetric
  344. //--------------------------------------------------------------------------
  345. // PodAnnotationsMetric is a prometheus.Metric used to encode namespace annotations
  346. type PodAnnotationsMetric struct {
  347. name string
  348. fqName string
  349. help string
  350. labelNames []string
  351. labelValues []string
  352. namespace string
  353. }
  354. // Creates a new PodAnnotationsMetric, implementation of prometheus.Metric
  355. func newPodAnnotationMetric(namespace, name, fqname string, labelNames []string, labelValues []string) PodAnnotationsMetric {
  356. return PodAnnotationsMetric{
  357. namespace: namespace,
  358. fqName: fqname,
  359. labelNames: labelNames,
  360. labelValues: labelValues,
  361. help: "kube_pod_annotations Pod Annotations",
  362. }
  363. }
  364. // Desc returns the descriptor for the Metric. This method idempotently
  365. // returns the same descriptor throughout the lifetime of the Metric.
  366. func (pam PodAnnotationsMetric) Desc() *prometheus.Desc {
  367. l := prometheus.Labels{"namespace": pam.namespace, "pod": pam.name}
  368. return prometheus.NewDesc(pam.fqName, pam.help, pam.labelNames, l)
  369. }
  370. // Write encodes the Metric into a "Metric" Protocol Buffer data
  371. // transmission object.
  372. func (pam PodAnnotationsMetric) Write(m *dto.Metric) error {
  373. h := float64(1)
  374. m.Gauge = &dto.Gauge{
  375. Value: &h,
  376. }
  377. var labels []*dto.LabelPair
  378. for i := range pam.labelNames {
  379. labels = append(labels, &dto.LabelPair{
  380. Name: &pam.labelNames[i],
  381. Value: &pam.labelValues[i],
  382. })
  383. }
  384. n := "namespace"
  385. labels = append(labels, &dto.LabelPair{
  386. Name: &n,
  387. Value: &pam.namespace,
  388. })
  389. r := "pod"
  390. labels = append(labels, &dto.LabelPair{
  391. Name: &r,
  392. Value: &pam.name,
  393. })
  394. m.Label = labels
  395. return nil
  396. }
  397. //--------------------------------------------------------------------------
  398. // ClusterInfoCollector
  399. //--------------------------------------------------------------------------
  400. // ClusterInfoCollector is a prometheus collector that generates ClusterInfoMetrics
  401. type ClusterInfoCollector struct {
  402. Cloud cloud.Provider
  403. KubeClientSet kubernetes.Interface
  404. }
  405. // Describe sends the super-set of all possible descriptors of metrics
  406. // collected by this Collector.
  407. func (cic ClusterInfoCollector) Describe(ch chan<- *prometheus.Desc) {
  408. ch <- prometheus.NewDesc("kubecost_cluster_info", "Kubecost Cluster Info", []string{}, nil)
  409. }
  410. // Collect is called by the Prometheus registry when collecting metrics.
  411. func (cic ClusterInfoCollector) Collect(ch chan<- prometheus.Metric) {
  412. clusterInfo := GetClusterInfo(cic.KubeClientSet, cic.Cloud)
  413. labels := prom.MapToLabels(clusterInfo)
  414. m := newClusterInfoMetric("kubecost_cluster_info", labels)
  415. ch <- m
  416. }
  417. //--------------------------------------------------------------------------
  418. // ClusterInfoMetric
  419. //--------------------------------------------------------------------------
  420. // ClusterInfoMetric is a prometheus.Metric used to encode the local cluster info
  421. type ClusterInfoMetric struct {
  422. fqName string
  423. help string
  424. labels map[string]string
  425. }
  426. // Creates a new ClusterInfoMetric, implementation of prometheus.Metric
  427. func newClusterInfoMetric(fqName string, labels map[string]string) ClusterInfoMetric {
  428. return ClusterInfoMetric{
  429. fqName: fqName,
  430. labels: labels,
  431. help: "kubecost_cluster_info ClusterInfo",
  432. }
  433. }
  434. // Desc returns the descriptor for the Metric. This method idempotently
  435. // returns the same descriptor throughout the lifetime of the Metric.
  436. func (cim ClusterInfoMetric) Desc() *prometheus.Desc {
  437. l := prometheus.Labels{}
  438. return prometheus.NewDesc(cim.fqName, cim.help, prom.LabelNamesFrom(cim.labels), l)
  439. }
  440. // Write encodes the Metric into a "Metric" Protocol Buffer data
  441. // transmission object.
  442. func (cim ClusterInfoMetric) Write(m *dto.Metric) error {
  443. h := float64(1)
  444. m.Gauge = &dto.Gauge{
  445. Value: &h,
  446. }
  447. var labels []*dto.LabelPair
  448. for k, v := range cim.labels {
  449. labels = append(labels, &dto.LabelPair{
  450. Name: toStringPtr(k),
  451. Value: toStringPtr(v),
  452. })
  453. }
  454. m.Label = labels
  455. return nil
  456. }
  457. // toStringPtr is used to create a new string pointer from iteration vars
  458. func toStringPtr(s string) *string {
  459. return &s
  460. }
  461. //--------------------------------------------------------------------------
  462. // Cost Model Metrics Initialization
  463. //--------------------------------------------------------------------------
  464. // Only allow the metrics to be instantiated and registered once
  465. var metricsInit sync.Once
  466. var (
  467. cpuGv *prometheus.GaugeVec
  468. ramGv *prometheus.GaugeVec
  469. gpuGv *prometheus.GaugeVec
  470. pvGv *prometheus.GaugeVec
  471. spotGv *prometheus.GaugeVec
  472. totalGv *prometheus.GaugeVec
  473. ramAllocGv *prometheus.GaugeVec
  474. cpuAllocGv *prometheus.GaugeVec
  475. gpuAllocGv *prometheus.GaugeVec
  476. pvAllocGv *prometheus.GaugeVec
  477. networkZoneEgressCostG prometheus.Gauge
  478. networkRegionEgressCostG prometheus.Gauge
  479. networkInternetEgressCostG prometheus.Gauge
  480. clusterManagementCostGv *prometheus.GaugeVec
  481. lbCostGv *prometheus.GaugeVec
  482. )
  483. // initCostModelMetrics uses a sync.Once to ensure that these metrics are only created once
  484. func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud.Provider) {
  485. metricsInit.Do(func() {
  486. cpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  487. Name: "node_cpu_hourly_cost",
  488. Help: "node_cpu_hourly_cost hourly cost for each cpu on this node",
  489. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  490. ramGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  491. Name: "node_ram_hourly_cost",
  492. Help: "node_ram_hourly_cost hourly cost for each gb of ram on this node",
  493. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  494. gpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  495. Name: "node_gpu_hourly_cost",
  496. Help: "node_gpu_hourly_cost hourly cost for each gpu on this node",
  497. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  498. pvGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  499. Name: "pv_hourly_cost",
  500. Help: "pv_hourly_cost Cost per GB per hour on a persistent disk",
  501. }, []string{"volumename", "persistentvolume", "provider_id"})
  502. spotGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  503. Name: "kubecost_node_is_spot",
  504. Help: "kubecost_node_is_spot Cloud provider info about node preemptibility",
  505. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  506. totalGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  507. Name: "node_total_hourly_cost",
  508. Help: "node_total_hourly_cost Total node cost per hour",
  509. }, []string{"instance", "node", "instance_type", "region", "provider_id"})
  510. ramAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  511. Name: "container_memory_allocation_bytes",
  512. Help: "container_memory_allocation_bytes Bytes of RAM used",
  513. }, []string{"namespace", "pod", "container", "instance", "node"})
  514. cpuAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  515. Name: "container_cpu_allocation",
  516. Help: "container_cpu_allocation Percent of a single CPU used in a minute",
  517. }, []string{"namespace", "pod", "container", "instance", "node"})
  518. gpuAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  519. Name: "container_gpu_allocation",
  520. Help: "container_gpu_allocation GPU used",
  521. }, []string{"namespace", "pod", "container", "instance", "node"})
  522. pvAllocGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  523. Name: "pod_pvc_allocation",
  524. Help: "pod_pvc_allocation Bytes used by a PVC attached to a pod",
  525. }, []string{"namespace", "pod", "persistentvolumeclaim", "persistentvolume"})
  526. networkZoneEgressCostG = prometheus.NewGauge(prometheus.GaugeOpts{
  527. Name: "kubecost_network_zone_egress_cost",
  528. Help: "kubecost_network_zone_egress_cost Total cost per GB egress across zones",
  529. })
  530. networkRegionEgressCostG = prometheus.NewGauge(prometheus.GaugeOpts{
  531. Name: "kubecost_network_region_egress_cost",
  532. Help: "kubecost_network_region_egress_cost Total cost per GB egress across regions",
  533. })
  534. networkInternetEgressCostG = prometheus.NewGauge(prometheus.GaugeOpts{
  535. Name: "kubecost_network_internet_egress_cost",
  536. Help: "kubecost_network_internet_egress_cost Total cost per GB of internet egress.",
  537. })
  538. clusterManagementCostGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
  539. Name: "kubecost_cluster_management_cost",
  540. Help: "kubecost_cluster_management_cost Hourly cost paid as a cluster management fee.",
  541. }, []string{"provisioner_name"})
  542. lbCostGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{ // no differentiation between ELB and ALB right now
  543. Name: "kubecost_load_balancer_cost",
  544. Help: "kubecost_load_balancer_cost Hourly cost of load balancer",
  545. }, []string{"ingress_ip", "namespace", "service_name"}) // assumes one ingress IP per load balancer
  546. // Register cost-model metrics for emission
  547. prometheus.MustRegister(cpuGv, ramGv, gpuGv, totalGv, pvGv, spotGv)
  548. prometheus.MustRegister(ramAllocGv, cpuAllocGv, gpuAllocGv, pvAllocGv)
  549. prometheus.MustRegister(networkZoneEgressCostG, networkRegionEgressCostG, networkInternetEgressCostG)
  550. prometheus.MustRegister(clusterManagementCostGv, lbCostGv)
  551. // General Metric Collectors
  552. prometheus.MustRegister(ServiceCollector{
  553. KubeClusterCache: clusterCache,
  554. })
  555. prometheus.MustRegister(DeploymentCollector{
  556. KubeClusterCache: clusterCache,
  557. })
  558. prometheus.MustRegister(StatefulsetCollector{
  559. KubeClusterCache: clusterCache,
  560. })
  561. prometheus.MustRegister(ClusterInfoCollector{
  562. KubeClientSet: clusterCache.GetClient(),
  563. Cloud: provider,
  564. })
  565. })
  566. }
  567. //--------------------------------------------------------------------------
  568. // CostModelMetricsEmitter
  569. //--------------------------------------------------------------------------
  570. // CostModelMetricsEmitter emits all cost-model specific metrics calculated by
  571. // the CostModel.ComputeCostData() method.
  572. type CostModelMetricsEmitter struct {
  573. PrometheusClient promclient.Client
  574. KubeClusterCache clustercache.ClusterCache
  575. CloudProvider cloud.Provider
  576. Model *CostModel
  577. // Metrics
  578. CPUPriceRecorder *prometheus.GaugeVec
  579. RAMPriceRecorder *prometheus.GaugeVec
  580. PersistentVolumePriceRecorder *prometheus.GaugeVec
  581. GPUPriceRecorder *prometheus.GaugeVec
  582. PVAllocationRecorder *prometheus.GaugeVec
  583. NodeSpotRecorder *prometheus.GaugeVec
  584. NodeTotalPriceRecorder *prometheus.GaugeVec
  585. RAMAllocationRecorder *prometheus.GaugeVec
  586. CPUAllocationRecorder *prometheus.GaugeVec
  587. GPUAllocationRecorder *prometheus.GaugeVec
  588. ClusterManagementCostRecorder *prometheus.GaugeVec
  589. LBCostRecorder *prometheus.GaugeVec
  590. NetworkZoneEgressRecorder prometheus.Gauge
  591. NetworkRegionEgressRecorder prometheus.Gauge
  592. NetworkInternetEgressRecorder prometheus.Gauge
  593. // Flow Control
  594. recordingLock *sync.Mutex
  595. recordingStopping bool
  596. recordingStop chan bool
  597. }
  598. // NewCostModelMetricsEmitter creates a new cost-model metrics emitter. Use Start() to begin metric emission.
  599. func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clustercache.ClusterCache, provider cloud.Provider, model *CostModel) *CostModelMetricsEmitter {
  600. // init will only actually execute once to register the custom gauges
  601. initCostModelMetrics(clusterCache, provider)
  602. return &CostModelMetricsEmitter{
  603. PrometheusClient: promClient,
  604. KubeClusterCache: clusterCache,
  605. CloudProvider: provider,
  606. Model: model,
  607. CPUPriceRecorder: cpuGv,
  608. RAMPriceRecorder: ramGv,
  609. GPUPriceRecorder: gpuGv,
  610. PersistentVolumePriceRecorder: pvGv,
  611. NodeSpotRecorder: spotGv,
  612. NodeTotalPriceRecorder: totalGv,
  613. RAMAllocationRecorder: ramAllocGv,
  614. CPUAllocationRecorder: cpuAllocGv,
  615. GPUAllocationRecorder: gpuAllocGv,
  616. PVAllocationRecorder: pvAllocGv,
  617. NetworkZoneEgressRecorder: networkZoneEgressCostG,
  618. NetworkRegionEgressRecorder: networkRegionEgressCostG,
  619. NetworkInternetEgressRecorder: networkInternetEgressCostG,
  620. ClusterManagementCostRecorder: clusterManagementCostGv,
  621. LBCostRecorder: lbCostGv,
  622. recordingLock: new(sync.Mutex),
  623. recordingStopping: false,
  624. recordingStop: nil,
  625. }
  626. }
  627. // Checks to see if there is a metric recording stop channel. If it exists, a new
  628. // channel is not created and false is returned. If it doesn't exist, a new channel
  629. // is created and true is returned.
  630. func (cmme *CostModelMetricsEmitter) checkOrCreateRecordingChan() bool {
  631. cmme.recordingLock.Lock()
  632. defer cmme.recordingLock.Unlock()
  633. if cmme.recordingStop != nil {
  634. return false
  635. }
  636. cmme.recordingStop = make(chan bool, 1)
  637. return true
  638. }
  639. // IsRunning returns true if metric recording is running.
  640. func (cmme *CostModelMetricsEmitter) IsRunning() bool {
  641. cmme.recordingLock.Lock()
  642. defer cmme.recordingLock.Unlock()
  643. return cmme.recordingStop != nil
  644. }
  645. // StartCostModelMetricRecording starts the go routine that emits metrics used to determine
  646. // cluster costs.
  647. func (cmme *CostModelMetricsEmitter) Start() bool {
  648. // Check to see if we're already recording
  649. // This function will create the stop recording channel and return true
  650. // if it doesn't exist.
  651. if !cmme.checkOrCreateRecordingChan() {
  652. log.Errorf("Attempted to start cost model metric recording when it's already running.")
  653. return false
  654. }
  655. go func() {
  656. defer errors.HandlePanic()
  657. containerSeen := make(map[string]bool)
  658. nodeSeen := make(map[string]bool)
  659. loadBalancerSeen := make(map[string]bool)
  660. pvSeen := make(map[string]bool)
  661. pvcSeen := make(map[string]bool)
  662. getKeyFromLabelStrings := func(labels ...string) string {
  663. return strings.Join(labels, ",")
  664. }
  665. getLabelStringsFromKey := func(key string) []string {
  666. return strings.Split(key, ",")
  667. }
  668. var defaultRegion string = ""
  669. nodeList := cmme.KubeClusterCache.GetAllNodes()
  670. if len(nodeList) > 0 {
  671. defaultRegion = nodeList[0].Labels[v1.LabelZoneRegion]
  672. }
  673. for {
  674. klog.V(4).Info("Recording prices...")
  675. podlist := cmme.KubeClusterCache.GetAllPods()
  676. podStatus := make(map[string]v1.PodPhase)
  677. for _, pod := range podlist {
  678. podStatus[pod.Name] = pod.Status.Phase
  679. }
  680. cfg, _ := cmme.CloudProvider.GetConfig()
  681. provisioner, clusterManagementCost, err := cmme.CloudProvider.ClusterManagementPricing()
  682. if err != nil {
  683. klog.V(1).Infof("Error getting cluster management cost %s", err.Error())
  684. }
  685. cmme.ClusterManagementCostRecorder.WithLabelValues(provisioner).Set(clusterManagementCost)
  686. // Record network pricing at global scope
  687. networkCosts, err := cmme.CloudProvider.NetworkPricing()
  688. if err != nil {
  689. klog.V(4).Infof("Failed to retrieve network costs: %s", err.Error())
  690. } else {
  691. cmme.NetworkZoneEgressRecorder.Set(networkCosts.ZoneNetworkEgressCost)
  692. cmme.NetworkRegionEgressRecorder.Set(networkCosts.RegionNetworkEgressCost)
  693. cmme.NetworkInternetEgressRecorder.Set(networkCosts.InternetNetworkEgressCost)
  694. }
  695. // TODO: Pass PrometheusClient and CloudProvider into CostModel on instantiation so this isn't so awkward
  696. data, err := cmme.Model.ComputeCostData(cmme.PrometheusClient, cmme.CloudProvider, "2m", "", "")
  697. if err != nil {
  698. // For an error collection, we'll just log the length of the errors (ComputeCostData already logs the
  699. // actual errors)
  700. if prom.IsErrorCollection(err) {
  701. if ec, ok := err.(prom.QueryErrorCollection); ok {
  702. klog.V(1).Info("Error in price recording: %d errors occurred", len(ec.Errors()))
  703. }
  704. } else {
  705. klog.V(1).Info("Error in price recording: " + err.Error())
  706. }
  707. // zero the for loop so the time.Sleep will still work
  708. data = map[string]*CostData{}
  709. }
  710. // TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
  711. nodes, err := cmme.Model.GetNodeCost(cmme.CloudProvider)
  712. for nodeName, node := range nodes {
  713. // Emit costs, guarding against NaN inputs for custom pricing.
  714. cpuCost, _ := strconv.ParseFloat(node.VCPUCost, 64)
  715. if math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) {
  716. cpuCost, _ = strconv.ParseFloat(cfg.CPU, 64)
  717. if math.IsNaN(cpuCost) || math.IsInf(cpuCost, 0) {
  718. cpuCost = 0
  719. }
  720. }
  721. cpu, _ := strconv.ParseFloat(node.VCPU, 64)
  722. if math.IsNaN(cpu) || math.IsInf(cpu, 0) {
  723. cpu = 1 // Assume 1 CPU
  724. }
  725. ramCost, _ := strconv.ParseFloat(node.RAMCost, 64)
  726. if math.IsNaN(ramCost) || math.IsInf(ramCost, 0) {
  727. ramCost, _ = strconv.ParseFloat(cfg.RAM, 64)
  728. if math.IsNaN(ramCost) || math.IsInf(ramCost, 0) {
  729. ramCost = 0
  730. }
  731. }
  732. ram, _ := strconv.ParseFloat(node.RAMBytes, 64)
  733. if math.IsNaN(ram) || math.IsInf(ram, 0) {
  734. ram = 0
  735. }
  736. gpu, _ := strconv.ParseFloat(node.GPU, 64)
  737. if math.IsNaN(gpu) || math.IsInf(gpu, 0) {
  738. gpu = 0
  739. }
  740. gpuCost, _ := strconv.ParseFloat(node.GPUCost, 64)
  741. if math.IsNaN(gpuCost) || math.IsInf(gpuCost, 0) {
  742. gpuCost, _ = strconv.ParseFloat(cfg.GPU, 64)
  743. if math.IsNaN(gpuCost) || math.IsInf(gpuCost, 0) {
  744. gpuCost = 0
  745. }
  746. }
  747. nodeType := node.InstanceType
  748. nodeRegion := node.Region
  749. totalCost := cpu*cpuCost + ramCost*(ram/1024/1024/1024) + gpu*gpuCost
  750. cmme.CPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(cpuCost)
  751. cmme.RAMPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(ramCost)
  752. cmme.GPUPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(gpuCost)
  753. cmme.NodeTotalPriceRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(totalCost)
  754. if node.IsSpot() {
  755. cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(1.0)
  756. } else {
  757. cmme.NodeSpotRecorder.WithLabelValues(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID).Set(0.0)
  758. }
  759. labelKey := getKeyFromLabelStrings(nodeName, nodeName, nodeType, nodeRegion, node.ProviderID)
  760. nodeSeen[labelKey] = true
  761. }
  762. // TODO: Pass CloudProvider into CostModel on instantiation so this isn't so awkward
  763. loadBalancers, err := cmme.Model.GetLBCost(cmme.CloudProvider)
  764. for lbKey, lb := range loadBalancers {
  765. // TODO: parse (if necessary) and calculate cost associated with loadBalancer based on dynamic cloud prices fetched into each lb struct on GetLBCost() call
  766. keyParts := getLabelStringsFromKey(lbKey)
  767. namespace := keyParts[0]
  768. serviceName := keyParts[1]
  769. ingressIP := ""
  770. if len(lb.IngressIPAddresses) > 0 {
  771. ingressIP = lb.IngressIPAddresses[0] // assumes one ingress IP per load balancer
  772. }
  773. cmme.LBCostRecorder.WithLabelValues(ingressIP, namespace, serviceName).Set(lb.Cost)
  774. labelKey := getKeyFromLabelStrings(namespace, serviceName)
  775. loadBalancerSeen[labelKey] = true
  776. }
  777. for _, costs := range data {
  778. nodeName := costs.NodeName
  779. namespace := costs.Namespace
  780. podName := costs.PodName
  781. containerName := costs.Name
  782. if costs.PVCData != nil {
  783. for _, pvc := range costs.PVCData {
  784. if pvc.Volume != nil {
  785. timesClaimed := pvc.TimesClaimed
  786. if timesClaimed == 0 {
  787. timesClaimed = 1 // unallocated PVs are unclaimed but have a full allocation
  788. }
  789. cmme.PVAllocationRecorder.WithLabelValues(namespace, podName, pvc.Claim, pvc.VolumeName).Set(pvc.Values[0].Value / float64(timesClaimed))
  790. labelKey := getKeyFromLabelStrings(namespace, podName, pvc.Claim, pvc.VolumeName)
  791. pvcSeen[labelKey] = true
  792. }
  793. }
  794. }
  795. if len(costs.RAMAllocation) > 0 {
  796. cmme.RAMAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.RAMAllocation[0].Value)
  797. }
  798. if len(costs.CPUAllocation) > 0 {
  799. cmme.CPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.CPUAllocation[0].Value)
  800. }
  801. if len(costs.GPUReq) > 0 {
  802. // allocation here is set to the request because shared GPU usage not yet supported.
  803. cmme.GPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.GPUReq[0].Value)
  804. }
  805. labelKey := getKeyFromLabelStrings(namespace, podName, containerName, nodeName, nodeName)
  806. if podStatus[podName] == v1.PodRunning { // Only report data for current pods
  807. containerSeen[labelKey] = true
  808. } else {
  809. containerSeen[labelKey] = false
  810. }
  811. storageClasses := cmme.KubeClusterCache.GetAllStorageClasses()
  812. storageClassMap := make(map[string]map[string]string)
  813. for _, storageClass := range storageClasses {
  814. params := storageClass.Parameters
  815. storageClassMap[storageClass.ObjectMeta.Name] = params
  816. if storageClass.GetAnnotations()["storageclass.kubernetes.io/is-default-class"] == "true" || storageClass.GetAnnotations()["storageclass.beta.kubernetes.io/is-default-class"] == "true" {
  817. storageClassMap["default"] = params
  818. storageClassMap[""] = params
  819. }
  820. }
  821. pvs := cmme.KubeClusterCache.GetAllPersistentVolumes()
  822. for _, pv := range pvs {
  823. parameters, ok := storageClassMap[pv.Spec.StorageClassName]
  824. if !ok {
  825. klog.V(4).Infof("Unable to find parameters for storage class \"%s\". Does pv \"%s\" have a storageClassName?", pv.Spec.StorageClassName, pv.Name)
  826. }
  827. var region string
  828. if r, ok := pv.Labels[v1.LabelZoneRegion]; ok {
  829. region = r
  830. } else {
  831. region = defaultRegion
  832. }
  833. cacPv := &cloud.PV{
  834. Class: pv.Spec.StorageClassName,
  835. Region: region,
  836. Parameters: parameters,
  837. }
  838. // TODO: GetPVCost should be a method in CostModel?
  839. GetPVCost(cacPv, pv, cmme.CloudProvider, region)
  840. c, _ := strconv.ParseFloat(cacPv.Cost, 64)
  841. cmme.PersistentVolumePriceRecorder.WithLabelValues(pv.Name, pv.Name, cacPv.ProviderID).Set(c)
  842. labelKey := getKeyFromLabelStrings(pv.Name, pv.Name)
  843. pvSeen[labelKey] = true
  844. }
  845. }
  846. for labelString, seen := range nodeSeen {
  847. if !seen {
  848. klog.V(4).Infof("Removing %s from nodes", labelString)
  849. labels := getLabelStringsFromKey(labelString)
  850. ok := cmme.NodeTotalPriceRecorder.DeleteLabelValues(labels...)
  851. if ok {
  852. klog.V(4).Infof("removed %s from totalprice", labelString)
  853. } else {
  854. klog.Infof("FAILURE TO REMOVE %s from totalprice", labelString)
  855. }
  856. ok = cmme.NodeSpotRecorder.DeleteLabelValues(labels...)
  857. if ok {
  858. klog.V(4).Infof("removed %s from spot records", labelString)
  859. } else {
  860. klog.Infof("FAILURE TO REMOVE %s from spot records", labelString)
  861. }
  862. ok = cmme.CPUPriceRecorder.DeleteLabelValues(labels...)
  863. if ok {
  864. klog.V(4).Infof("removed %s from cpuprice", labelString)
  865. } else {
  866. klog.Infof("FAILURE TO REMOVE %s from cpuprice", labelString)
  867. }
  868. ok = cmme.GPUPriceRecorder.DeleteLabelValues(labels...)
  869. if ok {
  870. klog.V(4).Infof("removed %s from gpuprice", labelString)
  871. } else {
  872. klog.Infof("FAILURE TO REMOVE %s from gpuprice", labelString)
  873. }
  874. ok = cmme.RAMPriceRecorder.DeleteLabelValues(labels...)
  875. if ok {
  876. klog.V(4).Infof("removed %s from ramprice", labelString)
  877. } else {
  878. klog.Infof("FAILURE TO REMOVE %s from ramprice", labelString)
  879. }
  880. delete(nodeSeen, labelString)
  881. } else {
  882. nodeSeen[labelString] = false
  883. }
  884. }
  885. for labelString, seen := range loadBalancerSeen {
  886. if !seen {
  887. labels := getLabelStringsFromKey(labelString)
  888. cmme.LBCostRecorder.DeleteLabelValues(labels...)
  889. } else {
  890. loadBalancerSeen[labelString] = false
  891. }
  892. }
  893. for labelString, seen := range containerSeen {
  894. if !seen {
  895. labels := getLabelStringsFromKey(labelString)
  896. cmme.RAMAllocationRecorder.DeleteLabelValues(labels...)
  897. cmme.CPUAllocationRecorder.DeleteLabelValues(labels...)
  898. cmme.GPUAllocationRecorder.DeleteLabelValues(labels...)
  899. delete(containerSeen, labelString)
  900. } else {
  901. containerSeen[labelString] = false
  902. }
  903. }
  904. for labelString, seen := range pvSeen {
  905. if !seen {
  906. labels := getLabelStringsFromKey(labelString)
  907. cmme.PersistentVolumePriceRecorder.DeleteLabelValues(labels...)
  908. delete(pvSeen, labelString)
  909. } else {
  910. pvSeen[labelString] = false
  911. }
  912. }
  913. for labelString, seen := range pvcSeen {
  914. if !seen {
  915. labels := getLabelStringsFromKey(labelString)
  916. cmme.PVAllocationRecorder.DeleteLabelValues(labels...)
  917. delete(pvcSeen, labelString)
  918. } else {
  919. pvcSeen[labelString] = false
  920. }
  921. }
  922. select {
  923. case <-time.After(time.Minute):
  924. case <-cmme.recordingStop:
  925. cmme.recordingLock.Lock()
  926. cmme.recordingStopping = false
  927. cmme.recordingStop = nil
  928. cmme.recordingLock.Unlock()
  929. return
  930. }
  931. }
  932. }()
  933. return true
  934. }
  935. // Stop halts the metrics emission loop after the current emission is completed
  936. // or if the emission is paused.
  937. func (cmme *CostModelMetricsEmitter) Stop() {
  938. cmme.recordingLock.Lock()
  939. defer cmme.recordingLock.Unlock()
  940. if !cmme.recordingStopping && cmme.recordingStop != nil {
  941. cmme.recordingStopping = true
  942. close(cmme.recordingStop)
  943. }
  944. }