2
0

dcgm_saturation.go 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. package kubemodel
  2. import (
  3. "fmt"
  4. "math"
  5. "maps"
  6. )
// DCGMDeviceSaturation carries USE-method saturation signals for an NVIDIA
// GPU recorded from the DCGM exporter: where utilization reports how busy
// the device was, saturation reports work queued, rejected, or slowed
// because the device could not service demand. It lives on DCGMDevice
// because these signals are NVIDIA/DCGM-specific; the vendor-neutral
// surface is the DeviceSaturation interface.
//
// Every field is an independent primitive; no composite score is computed.
// A nil field means the underlying source metric was not available in the
// window (no dcgm-exporter, field disabled in its config, or no DCP
// profiling support), never that the value was zero. Ratios are fractions
// of the observed window in [0, 1] unless noted.
//
// Validate enforces the [0, 1] range on every entry of the two throttle
// maps and on the MemoryUsedRatio*, MemoryPressureRatio, DRAMActive*,
// SMActiveAvg and SMOccupancyAvg fields; XIDErrorCount and the
// interconnect rates are only required to be non-negative and not NaN.
// @bingen:generate:DCGMDeviceSaturation
type DCGMDeviceSaturation struct {
	// ThrottleViolationRatios maps a throttle reason (power, thermal,
	// sync_boost, board_limit) to the fraction of the window the device
	// spent throttled for that reason, from the cumulative
	// DCGM_FI_DEV_*_VIOLATION microsecond counters.
	ThrottleViolationRatios map[string]float64 `json:"throttleViolationRatios,omitempty"`
	// ThrottleReasonRatios maps a throttle reason (sw_power_cap,
	// hw_slowdown, sync_boost, sw_thermal, hw_thermal, hw_power_brake) to
	// the fraction of the window the corresponding bit of
	// DCGM_FI_DEV_CLOCK_THROTTLE_REASONS was set.
	// NOTE: sync_boost is a key in both maps, so a reason name alone does
	// not identify which map an entry came from.
	ThrottleReasonRatios map[string]float64 `json:"throttleReasonRatios,omitempty"`
	// MemoryUsedRatioAvg/Max are framebuffer occupancy over the window:
	// FB_USED / (FB_USED + FB_FREE). Avg is the average and Max the peak
	// of that ratio.
	MemoryUsedRatioAvg *float64 `json:"memoryUsedRatioAvg,omitempty"`
	MemoryUsedRatioMax *float64 `json:"memoryUsedRatioMax,omitempty"`
	// MemoryPressureRatio is the fraction of the window framebuffer
	// occupancy exceeded the configured threshold.
	// NOTE(review): the threshold is not defined in this file — confirm
	// where it is configured.
	MemoryPressureRatio *float64 `json:"memoryPressureRatio,omitempty"`
	// XIDErrorCount counts XID error events observed in the window. It is
	// a count, not a ratio: Validate only requires it to be non-negative
	// and not NaN.
	XIDErrorCount *float64 `json:"xidErrorCount,omitempty"`
	// DRAMActiveAvg/Max are the ratio of cycles the device memory interface
	// was active (DCP profiling).
	DRAMActiveAvg *float64 `json:"dramActiveAvg,omitempty"`
	DRAMActiveMax *float64 `json:"dramActiveMax,omitempty"`
	// SMActiveAvg and SMOccupancyAvg distinguish compute-bound from
	// bandwidth- or latency-bound saturation (DCP profiling). Both are
	// validated as ratios in [0, 1].
	SMActiveAvg    *float64 `json:"smActiveAvg,omitempty"`
	SMOccupancyAvg *float64 `json:"smOccupancyAvg,omitempty"`
	// Interconnect throughput in bytes/sec (DCP profiling). Link capacity
	// is not derivable from DCGM, so these are raw rates, not ratios; they
	// have no upper bound in Validate.
	PCIeTxBytesAvg   *float64 `json:"pcieTxBytesAvg,omitempty"`
	PCIeRxBytesAvg   *float64 `json:"pcieRxBytesAvg,omitempty"`
	NVLinkTxBytesAvg *float64 `json:"nvlinkTxBytesAvg,omitempty"`
	NVLinkRxBytesAvg *float64 `json:"nvlinkRxBytesAvg,omitempty"`
}
  55. // ratioFields returns name/value pairs for every field constrained to
  56. // [0, 1], for validation.
  57. func (s *DCGMDeviceSaturation) ratioFields() map[string]*float64 {
  58. return map[string]*float64{
  59. "MemoryUsedRatioAvg": s.MemoryUsedRatioAvg,
  60. "MemoryUsedRatioMax": s.MemoryUsedRatioMax,
  61. "MemoryPressureRatio": s.MemoryPressureRatio,
  62. "DRAMActiveAvg": s.DRAMActiveAvg,
  63. "DRAMActiveMax": s.DRAMActiveMax,
  64. "SMActiveAvg": s.SMActiveAvg,
  65. "SMOccupancyAvg": s.SMOccupancyAvg,
  66. }
  67. }
  68. // Validate validates the DCGMDeviceSaturation fields.
  69. func (s *DCGMDeviceSaturation) Validate() error {
  70. if s == nil {
  71. return nil
  72. }
  73. for _, m := range []map[string]float64{s.ThrottleViolationRatios, s.ThrottleReasonRatios} {
  74. for reason, ratio := range m {
  75. if math.IsNaN(ratio) || ratio < 0 || ratio > 1 {
  76. return fmt.Errorf("throttle ratio for %q must be 0-1, got %v", reason, ratio)
  77. }
  78. }
  79. }
  80. for name, value := range s.ratioFields() {
  81. if value != nil && (math.IsNaN(*value) || *value < 0 || *value > 1) {
  82. return fmt.Errorf("%s must be 0-1, got %v", name, *value)
  83. }
  84. }
  85. for name, value := range map[string]*float64{
  86. "XIDErrorCount": s.XIDErrorCount,
  87. "PCIeTxBytesAvg": s.PCIeTxBytesAvg,
  88. "PCIeRxBytesAvg": s.PCIeRxBytesAvg,
  89. "NVLinkTxBytesAvg": s.NVLinkTxBytesAvg,
  90. "NVLinkRxBytesAvg": s.NVLinkRxBytesAvg,
  91. } {
  92. if value != nil && (math.IsNaN(*value) || *value < 0) {
  93. return fmt.Errorf("%s cannot be negative, got %v", name, *value)
  94. }
  95. }
  96. return nil
  97. }
  98. // Clone creates a deep copy of the DCGMDeviceSaturation.
  99. func (s *DCGMDeviceSaturation) Clone() *DCGMDeviceSaturation {
  100. if s == nil {
  101. return nil
  102. }
  103. cloneFloat := func(v *float64) *float64 {
  104. if v == nil {
  105. return nil
  106. }
  107. c := *v
  108. return &c
  109. }
  110. return &DCGMDeviceSaturation{
  111. ThrottleViolationRatios: maps.Clone(s.ThrottleViolationRatios),
  112. ThrottleReasonRatios: maps.Clone(s.ThrottleReasonRatios),
  113. MemoryUsedRatioAvg: cloneFloat(s.MemoryUsedRatioAvg),
  114. MemoryUsedRatioMax: cloneFloat(s.MemoryUsedRatioMax),
  115. MemoryPressureRatio: cloneFloat(s.MemoryPressureRatio),
  116. XIDErrorCount: cloneFloat(s.XIDErrorCount),
  117. DRAMActiveAvg: cloneFloat(s.DRAMActiveAvg),
  118. DRAMActiveMax: cloneFloat(s.DRAMActiveMax),
  119. SMActiveAvg: cloneFloat(s.SMActiveAvg),
  120. SMOccupancyAvg: cloneFloat(s.SMOccupancyAvg),
  121. PCIeTxBytesAvg: cloneFloat(s.PCIeTxBytesAvg),
  122. PCIeRxBytesAvg: cloneFloat(s.PCIeRxBytesAvg),
  123. NVLinkTxBytesAvg: cloneFloat(s.NVLinkTxBytesAvg),
  124. NVLinkRxBytesAvg: cloneFloat(s.NVLinkRxBytesAvg),
  125. }
  126. }