dcgm.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. package kubemodel
  2. import (
  3. "fmt"
  4. "time"
  5. "maps"
  6. )
// DCGMDevice holds recordings from the DCGM exporter, which provides
// identification and usage metrics for NVIDIA GPUs. These devices can be
// incorporated into the cluster via the k8s Device Plugin API or DRA.
// While the DCGM exporter does provide unique identifiers for the containers
// that it is reporting metrics on, this type is split out to provide some
// isolation from the rest of the KubeModel, which represents universal
// structures from the k8s API. It is left to the end user to interpret the
// relationships to the rest of the cluster based on container unique
// identifiers.
// @bingen:generate:DCGMDevice
type DCGMDevice struct {
	// UUID identifies the device. It must be non-empty to pass
	// ValidateDCGMDevice and is the key used by RegisterDCGMDevice.
	UUID string `json:"uuid"`
	// Start and End bound the recording; ValidateDCGMDevice checks them
	// against the supplied window via checkWindow.
	Start time.Time `json:"start"`
	End time.Time `json:"end"`
	Device string `json:"device"`
	// ModelName is what GetName reports for this device.
	ModelName string `json:"modelName"`
	// Per-pod usage; presumably keyed by a pod identifier — confirm against
	// the code that populates it.
	PodUsages map[string]DCGMPod `json:"podUsages"`
	// USE-method saturation signals for this device, nil when no
	// saturation metrics were available (see DCGMDeviceSaturation)
	Saturation *DCGMDeviceSaturation `json:"saturation,omitempty"` // @bingen:field[version=3]
	// Device-level metrics from the default dcgm-exporter configuration.
	// Compute utilization is a percentage (0-100); memory used is absolute
	// bytes. Zero means not collected for the window.
	PowerWatts float64 `json:"powerWatts,omitempty"` // @bingen:field[version=4]
	TemperatureCelsius float64 `json:"temperatureCelsius,omitempty"` // @bingen:field[version=4]
	ComputeUtilizationAvg float64 `json:"computeUtilizationAvg,omitempty"` // @bingen:field[version=4]
	ComputeUtilizationMax float64 `json:"computeUtilizationMax,omitempty"` // @bingen:field[version=4]
	MemoryUsedBytesAvg float64 `json:"memoryUsedBytesAvg,omitempty"` // @bingen:field[version=4]
	MemoryUsedBytesMax float64 `json:"memoryUsedBytesMax,omitempty"` // @bingen:field[version=4]
}
  34. var (
  35. _ DeviceInfo = (*DCGMDevice)(nil)
  36. _ DevicePerformance = (*DCGMDevice)(nil)
  37. _ DeviceSaturation = (*DCGMDevice)(nil)
  38. )
  39. // DeviceInfo implementation. MIG parentage is not derivable from
  40. // dcgm-exporter labels (a MIG instance's series do not carry the physical
  41. // GPU UUID), so GetParent reports empty until a source for the mapping
  42. // exists (e.g. DRA slice attributes).
  43. func (d *DCGMDevice) GetIdentifier() string { return d.UUID }
  44. func (d *DCGMDevice) GetType() string { return "GPU" }
  45. func (d *DCGMDevice) GetName() string { return d.ModelName }
  46. func (d *DCGMDevice) GetPower() float64 { return d.PowerWatts }
  47. func (d *DCGMDevice) GetStart() time.Time { return d.Start }
  48. func (d *DCGMDevice) GetEnd() time.Time { return d.End }
  49. func (d *DCGMDevice) GetParent() string { return "" }
  50. // DevicePerformance implementation, backed by device-level DCGM series:
  51. // GR_ENGINE_ACTIVE (scaled to percent), FB_USED (scaled to bytes), and
  52. // GPU_TEMP.
  53. func (d *DCGMDevice) GetComputeUtilizationAverage() float64 { return d.ComputeUtilizationAvg }
  54. func (d *DCGMDevice) GetComputeUtilizationMax() float64 { return d.ComputeUtilizationMax }
  55. func (d *DCGMDevice) GetMemoryUtilizationAverage() float64 { return d.MemoryUsedBytesAvg }
  56. func (d *DCGMDevice) GetMemoryUtilizationMax() float64 { return d.MemoryUsedBytesMax }
  57. func (d *DCGMDevice) GetTemp() float64 { return d.TemperatureCelsius }
  58. // DeviceSaturation implementation. The vendor-neutral getters map onto
  59. // DCGM concepts: throttle violation counters DCGM_FI_DEV_*_VIOLATION,
  60. // throttle reason bitmask DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, framebuffer
  61. // occupancy FB_USED/(FB_USED+FB_FREE), error events = XID errors, memory
  62. // bandwidth = DRAM_ACTIVE, compute active/occupancy = SM_ACTIVE /
  63. // SM_OCCUPANCY, host link = PCIe, peer link = NVLink. ok=false / nil means
  64. // the DCGM field was unavailable, never zero.
  65. // saturationValue adapts the saturation struct's pointer fields to the
  66. // interface's (value, ok) presence contract, including when Saturation is
  67. // entirely nil.
  68. func (d *DCGMDevice) saturationValue(get func(*DCGMDeviceSaturation) *float64) (float64, bool) {
  69. if d.Saturation == nil {
  70. return 0, false
  71. }
  72. v := get(d.Saturation)
  73. if v == nil {
  74. return 0, false
  75. }
  76. return *v, true
  77. }
  78. func (d *DCGMDevice) GetThrottleViolationRatios() map[string]float64 {
  79. if d.Saturation == nil {
  80. return nil
  81. }
  82. return maps.Clone(d.Saturation.ThrottleViolationRatios)
  83. }
  84. func (d *DCGMDevice) GetThrottleReasonRatios() map[string]float64 {
  85. if d.Saturation == nil {
  86. return nil
  87. }
  88. return maps.Clone(d.Saturation.ThrottleReasonRatios)
  89. }
  90. func (d *DCGMDevice) GetMemoryUsedRatioAvg() (float64, bool) {
  91. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryUsedRatioAvg })
  92. }
  93. func (d *DCGMDevice) GetMemoryUsedRatioMax() (float64, bool) {
  94. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryUsedRatioMax })
  95. }
  96. func (d *DCGMDevice) GetMemoryPressureRatio() (float64, bool) {
  97. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryPressureRatio })
  98. }
  99. func (d *DCGMDevice) GetErrorEventCount() (float64, bool) {
  100. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.XIDErrorCount })
  101. }
  102. func (d *DCGMDevice) GetMemoryBandwidthActiveAvg() (float64, bool) {
  103. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.DRAMActiveAvg })
  104. }
  105. func (d *DCGMDevice) GetMemoryBandwidthActiveMax() (float64, bool) {
  106. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.DRAMActiveMax })
  107. }
  108. func (d *DCGMDevice) GetComputeActiveAvg() (float64, bool) {
  109. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.SMActiveAvg })
  110. }
  111. func (d *DCGMDevice) GetComputeOccupancyAvg() (float64, bool) {
  112. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.SMOccupancyAvg })
  113. }
  114. func (d *DCGMDevice) GetHostLinkTxBytesAvg() (float64, bool) {
  115. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.PCIeTxBytesAvg })
  116. }
  117. func (d *DCGMDevice) GetHostLinkRxBytesAvg() (float64, bool) {
  118. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.PCIeRxBytesAvg })
  119. }
  120. func (d *DCGMDevice) GetPeerLinkTxBytesAvg() (float64, bool) {
  121. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.NVLinkTxBytesAvg })
  122. }
  123. func (d *DCGMDevice) GetPeerLinkRxBytesAvg() (float64, bool) {
  124. return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.NVLinkRxBytesAvg })
  125. }
// DCGMPod is the value type of DCGMDevice.PodUsages and groups the
// per-container usage recorded for one pod on a device.
// @bingen:generate:DCGMPod
type DCGMPod struct {
	// Presumably keyed by a container identifier — confirm against the code
	// that populates it.
	ContainerUsages map[string]DCGMContainer `json:"container-usages"`
}
// DCGMContainer is the value type of DCGMPod.ContainerUsages and carries the
// average and maximum usage recorded for one container.
// NOTE(review): the unit of these values is not stated in this file —
// confirm against the code that populates them.
// @bingen:generate:DCGMContainer
type DCGMContainer struct {
	UsageAvg float64 `json:"usageAvg"`
	UsageMax float64 `json:"usageMax"`
}
  135. func (d *DCGMDevice) ValidateDCGMDevice(window Window) error {
  136. if d.UUID == "" {
  137. return fmt.Errorf("UUID is missing for DCGMDevice with device '%s'", d.Device)
  138. }
  139. if err := checkWindow(window, d.Start, d.End); err != nil {
  140. return err
  141. }
  142. if err := d.Saturation.Validate(); err != nil {
  143. return fmt.Errorf("invalid Saturation for DCGMDevice '%s': %w", d.UUID, err)
  144. }
  145. return nil
  146. }
  147. // RegisterDCGMDevice validates and adds a DCGMDevice to the set, keyed by UUID.
  148. func (kms *KubeModelSet) RegisterDCGMDevice(device *DCGMDevice) error {
  149. if err := device.ValidateDCGMDevice(kms.Window); err != nil {
  150. err = fmt.Errorf("RegisterDCGMDevice: invalid dcgm device: %w", err)
  151. kms.Error(err)
  152. return err
  153. }
  154. if _, ok := kms.DCGMDevices[device.UUID]; !ok {
  155. if kms.Cluster == nil {
  156. kms.Warnf("RegisterDCGMDevice: Cluster is nil")
  157. }
  158. kms.DCGMDevices[device.UUID] = device
  159. kms.Metadata.ObjectCount++
  160. }
  161. return nil
  162. }