package metric
  2. const (
  3. // Cluster Cache Metrics
  4. ClusterInfo = "cluster_info"
  5. NodeInfo = "node_info"
  6. NodeResourceCapacities = "node_resource_capacities"
  7. NodeResourcesAllocatable = "node_resources_allocatable"
  8. PodInfo = "pod_info"
  9. PodPVCVolume = "pod_pvc_volume"
  10. KubeNodeStatusCapacityCPUCores = "kube_node_status_capacity_cpu_cores"
  11. KubeNodeStatusCapacityMemoryBytes = "kube_node_status_capacity_memory_bytes"
  12. KubeNodeStatusAllocatableCPUCores = "kube_node_status_allocatable_cpu_cores"
  13. KubeNodeStatusAllocatableMemoryBytes = "kube_node_status_allocatable_memory_bytes"
  14. KubeNodeLabels = "kube_node_labels"
  15. KubePodLabels = "kube_pod_labels"
  16. KubePodAnnotations = "kube_pod_annotations"
  17. KubePodOwner = "kube_pod_owner"
  18. KubePodContainerStatusRunning = "kube_pod_container_status_running"
  19. KubePodContainerResourceRequests = "kube_pod_container_resource_requests"
  20. KubePodContainerResourceLimits = "kube_pod_container_resource_limits"
  21. KubePersistentVolumeClaimInfo = "kube_persistentvolumeclaim_info"
  22. KubePersistentVolumeClaimResourceRequestsStorageBytes = "kube_persistentvolumeclaim_resource_requests_storage_bytes"
  23. KubecostPVInfo = "kubecost_pv_info"
  24. KubePersistentVolumeCapacityBytes = "kube_persistentvolume_capacity_bytes"
  25. DeploymentInfo = "deployment_info"
  26. DeploymentLabels = "deployment_labels"
  27. DeploymentAnnotations = "deployment_annotations"
  28. DeploymentMatchLabels = "deployment_match_labels"
  29. StatefulSetInfo = "statefulset_info"
  30. StatefulSetLabels = "statefulset_labels"
  31. StatefulSetAnnotations = "statefulset_annotations"
  32. StatefulSetMatchLabels = "statefulSet_match_labels"
  33. DaemonSetInfo = "daemonset_info"
  34. DaemonSetLabels = "daemonset_labels"
  35. DaemonSetAnnotations = "daemonset_annotations"
  36. JobInfo = "job_info"
  37. JobLabels = "job_labels"
  38. JobAnnotations = "job_annotations"
  39. CronJobInfo = "cronjob_info"
  40. CronJobLabels = "cronjob_labels"
  41. CronJobAnnotations = "cronjob_annotations"
  42. ReplicaSetInfo = "replicaset_info"
  43. ReplicaSetLabels = "replicaset_labels"
  44. ReplicaSetAnnotations = "replicaset_annotations"
  45. NamespaceInfo = "namespace_info"
  46. KubeNamespaceLabels = "kube_namespace_labels"
  47. KubeNamespaceAnnotations = "kube_namespace_annotations"
  48. ServiceInfo = "service_info"
  49. ServiceSelectorLabels = "service_selector_labels"
  50. KubeReplicasetOwner = "kube_replicaset_owner"
  51. ContainerCPUAllocation = "container_cpu_allocation"
  52. ContainerMemoryAllocationBytes = "container_memory_allocation_bytes"
  53. ContainerGPUAllocation = "container_gpu_allocation"
  54. PodPVCAllocation = "pod_pvc_allocation"
  55. ResourceQuotaInfo = "resourcequota_info"
  56. KubeResourceQuotaSpecResourceRequests = "resourcequota_spec_resource_requests"
  57. KubeResourceQuotaSpecResourceLimits = "resourcequota_spec_resource_limits"
  58. KubeResourceQuotaStatusUsedResourceRequests = "resourcequota_status_used_resource_requests"
  59. KubeResourceQuotaStatusUsedResourceLimits = "resourcequota_status_used_resource_limits"
  60. // DCGM Metrics
  61. DCGMFIPROFGRENGINEACTIVE = "DCGM_FI_PROF_GR_ENGINE_ACTIVE"
  62. DCGMFIDEVDECUTIL = "DCGM_FI_DEV_DEC_UTIL"
  63. // DCGM saturation metrics (default dcgm-exporter configuration)
  64. DCGMFIDEVPOWERVIOLATION = "DCGM_FI_DEV_POWER_VIOLATION"
  65. DCGMFIDEVTHERMALVIOLATION = "DCGM_FI_DEV_THERMAL_VIOLATION"
  66. DCGMFIDEVSYNCBOOSTVIOLATION = "DCGM_FI_DEV_SYNC_BOOST_VIOLATION"
  67. DCGMFIDEVBOARDLIMITVIOLATION = "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"
  68. DCGMFIDEVFBUSED = "DCGM_FI_DEV_FB_USED"
  69. DCGMFIDEVFBFREE = "DCGM_FI_DEV_FB_FREE"
  70. DCGMFIDEVXIDERRORS = "DCGM_FI_DEV_XID_ERRORS"
  71. DCGMFIDEVPOWERUSAGE = "DCGM_FI_DEV_POWER_USAGE"
  72. DCGMFIDEVGPUTEMP = "DCGM_FI_DEV_GPU_TEMP"
  73. // DCGM saturation metrics requiring explicit enablement in the
  74. // dcgm-exporter configuration. The clock throttle reasons bitmask was
  75. // renamed in DCGM 3.3+; both names are scraped, at most one exists.
  76. DCGMFIDEVCLOCKTHROTTLEREASONS = "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS"
  77. DCGMFIDEVCLOCKSEVENTREASONS = "DCGM_FI_DEV_CLOCKS_EVENT_REASONS"
  78. // DCGM DCP profiling saturation metrics (require Volta+ GPUs;
  79. // SM_ACTIVE, SM_OCCUPANCY, and NVLINK additionally require explicit
  80. // enablement in the dcgm-exporter configuration)
  81. DCGMFIPROFDRAMACTIVE = "DCGM_FI_PROF_DRAM_ACTIVE"
  82. DCGMFIPROFSMACTIVE = "DCGM_FI_PROF_SM_ACTIVE"
  83. DCGMFIPROFSMOCCUPANCY = "DCGM_FI_PROF_SM_OCCUPANCY"
  84. DCGMFIPROFPCIETXBYTES = "DCGM_FI_PROF_PCIE_TX_BYTES"
  85. DCGMFIPROFPCIERXBYTES = "DCGM_FI_PROF_PCIE_RX_BYTES"
  86. DCGMFIPROFNVLINKTXBYTES = "DCGM_FI_PROF_NVLINK_TX_BYTES"
  87. DCGMFIPROFNVLINKRXBYTES = "DCGM_FI_PROF_NVLINK_RX_BYTES"
  88. // Synthetic metrics generated from DCGM scrapes (see pkg/metric/synthetic)
  89. // OpencostGPUMemoryUsedRatio is the per-sample framebuffer occupancy
  90. // ratio FB_USED / (FB_USED + FB_FREE), joined per scrape
  91. OpencostGPUMemoryUsedRatio = "opencost_gpu_memory_used_ratio"
  92. // Network Metrics
  93. KubecostPodNetworkEgressBytesTotal = "kubecost_pod_network_egress_bytes_total"
  94. KubecostPodNetworkIngressBytesTotal = "kubecost_pod_network_ingress_bytes_total"
  95. // Opencost Metrics
  96. KubecostClusterManagementCost = "kubecost_cluster_management_cost"
  97. KubecostNetworkZoneEgressCost = "kubecost_network_zone_egress_cost"
  98. KubecostNetworkRegionEgressCost = "kubecost_network_region_egress_cost"
  99. KubecostNetworkInternetEgressCost = "kubecost_network_internet_egress_cost"
  100. KubecostNetworkNatGatewayEgressCost = "kubecost_network_nat_gateway_egress_cost"
  101. KubecostNetworkNatGatewayIngressCost = "kubecost_network_nat_gateway_ingress_cost"
  102. PVHourlyCost = "pv_hourly_cost"
  103. KubecostLoadBalancerCost = "kubecost_load_balancer_cost"
  104. NodeTotalHourlyCost = "node_total_hourly_cost"
  105. NodeCPUHourlyCost = "node_cpu_hourly_cost"
  106. NodeRAMHourlyCost = "node_ram_hourly_cost"
  107. NodeGPUHourlyCost = "node_gpu_hourly_cost"
  108. NodeGPUCount = "node_gpu_count"
  109. KubecostNodeIsSpot = "kubecost_node_is_spot"
  110. // Stat Summary Metrics
  111. NodeCPUSecondsTotal = "node_cpu_seconds_total"
  112. NodeFSCapacityBytes = "node_fs_capacity_bytes" // replaces container_fs_limit_bytes
  113. ContainerNetworkReceiveBytesTotal = "container_network_receive_bytes_total"
  114. ContainerNetworkTransmitBytesTotal = "container_network_transmit_bytes_total"
  115. ContainerCPUUsageSecondsTotal = "container_cpu_usage_seconds_total"
  116. ContainerMemoryWorkingSetBytes = "container_memory_working_set_bytes"
  117. ContainerFSUsageBytes = "container_fs_usage_bytes"
  118. KubeletVolumeStatsUsedBytes = "kubelet_volume_stats_used_bytes"
  119. )