gpu_saturation.go 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. package opencost
  2. import (
  3. "maps"
  4. "math"
  5. )
  6. // GPU throttle reason bits as reported by the DCGM field
  7. // DCGM_FI_DEV_CLOCK_THROTTLE_REASONS (renamed DCGM_FI_DEV_CLOCKS_EVENT_REASONS
  8. // in DCGM 3.3+). The bit positions are defined by NVML's
  9. // nvmlClocksThrottleReasons constants:
  10. // https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksThrottleReasons.html
  11. //
  12. // Only saturation-relevant bits are enumerated here. The remaining bits
  13. // (gpu_idle 0x1, applications_clocks_setting 0x2, display_clock_setting 0x100)
  14. // describe configured operating states rather than the GPU being unable to
  15. // service demand, so they are intentionally excluded.
  16. const (
  17. GPUThrottleBitSwPowerCap uint64 = 0x4 // clocks reduced by software power cap
  18. GPUThrottleBitHwSlowdown uint64 = 0x8 // hardware slowdown (thermal or power brake) engaged
  19. GPUThrottleBitSyncBoost uint64 = 0x10 // clocks lowered to match a sync-boost group
  20. GPUThrottleBitSwThermal uint64 = 0x20 // software thermal slowdown
  21. GPUThrottleBitHwThermal uint64 = 0x40 // hardware thermal slowdown
  22. GPUThrottleBitHwPowerBrake uint64 = 0x80 // hardware power brake slowdown
  23. )
  24. // Canonical names for saturation-relevant GPU throttle reasons. These are the
  25. // keys used in GPUSaturation.ThrottleReasonRatios.
  26. const (
  27. GPUThrottleReasonSwPowerCap = "sw_power_cap"
  28. GPUThrottleReasonHwSlowdown = "hw_slowdown"
  29. GPUThrottleReasonSyncBoost = "sync_boost"
  30. GPUThrottleReasonSwThermal = "sw_thermal"
  31. GPUThrottleReasonHwThermal = "hw_thermal"
  32. GPUThrottleReasonHwPowerBrake = "hw_power_brake"
  33. )
  34. // Canonical names for GPU throttle violation counters reported by DCGM. These
  35. // are the keys used in GPUSaturation.ThrottleViolationRatios. Each maps to a
  36. // cumulative microsecond counter in the default dcgm-exporter configuration:
  37. // DCGM_FI_DEV_POWER_VIOLATION, DCGM_FI_DEV_THERMAL_VIOLATION,
  38. // DCGM_FI_DEV_SYNC_BOOST_VIOLATION, and DCGM_FI_DEV_BOARD_LIMIT_VIOLATION.
  39. const (
  40. GPUThrottleViolationPower = "power"
  41. GPUThrottleViolationThermal = "thermal"
  42. GPUThrottleViolationSyncBoost = "sync_boost"
  43. GPUThrottleViolationBoardLimit = "board_limit"
  44. )
  45. // GPUThrottleReason pairs a canonical throttle reason name with its bit in the
  46. // DCGM clock throttle reasons bitmask.
  47. type GPUThrottleReason struct {
  48. Name string
  49. Bit uint64
  50. }
  51. // GPUThrottleReasons enumerates every saturation-relevant throttle reason, in
  52. // ascending bit order. It is the single source of truth for bitmask decoding
  53. // and for generating per-reason Prometheus queries.
  54. var GPUThrottleReasons = []GPUThrottleReason{
  55. {Name: GPUThrottleReasonSwPowerCap, Bit: GPUThrottleBitSwPowerCap},
  56. {Name: GPUThrottleReasonHwSlowdown, Bit: GPUThrottleBitHwSlowdown},
  57. {Name: GPUThrottleReasonSyncBoost, Bit: GPUThrottleBitSyncBoost},
  58. {Name: GPUThrottleReasonSwThermal, Bit: GPUThrottleBitSwThermal},
  59. {Name: GPUThrottleReasonHwThermal, Bit: GPUThrottleBitHwThermal},
  60. {Name: GPUThrottleReasonHwPowerBrake, Bit: GPUThrottleBitHwPowerBrake},
  61. }
  62. // GPUThrottleReasonsFromMask decodes a DCGM clock throttle reasons bitmask
  63. // into the canonical names of the active saturation-relevant reasons.
  64. // Non-saturation bits are ignored. A zero mask decodes to an empty slice.
  65. func GPUThrottleReasonsFromMask(mask uint64) []string {
  66. reasons := make([]string, 0, len(GPUThrottleReasons))
  67. for _, reason := range GPUThrottleReasons {
  68. if mask&reason.Bit != 0 {
  69. reasons = append(reasons, reason.Name)
  70. }
  71. }
  72. return reasons
  73. }
  74. // GPUSaturation carries per-GPU saturation signals derived from DCGM exporter
  75. // metrics, following the USE method: where utilization reports how busy the
  76. // GPU was, saturation reports work that was queued, rejected, or slowed
  77. // because the GPU could not service demand.
  78. //
  79. // Every field is an independent primitive; no composite score is computed.
  80. // A nil field means the underlying DCGM metric was not available in the
  81. // window (no dcgm-exporter, field disabled in its config, or no DCP
  82. // profiling support), never that the value was zero.
  83. //
  84. // Ratios are fractions of the queried window in [0, 1] unless noted.
  85. type GPUSaturation struct {
  86. // ThrottleViolationRatios maps a GPUThrottleViolation* name to the
  87. // fraction of the window the GPU spent throttled for that reason,
  88. // derived from the cumulative DCGM_FI_DEV_*_VIOLATION microsecond
  89. // counters. These counters are part of the default dcgm-exporter
  90. // configuration.
  91. ThrottleViolationRatios map[string]float64 `json:"throttleViolationRatios,omitempty"`
  92. // ThrottleReasonRatios maps a GPUThrottleReason* name to the fraction
  93. // of the window the corresponding DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
  94. // bit was set. That field is not in the default dcgm-exporter
  95. // configuration and must be enabled explicitly. Reported for the whole
  96. // physical GPU, even when MIG or time-slicing is in use.
  97. ThrottleReasonRatios map[string]float64 `json:"throttleReasonRatios,omitempty"`
  98. // MemoryUsedRatioAvg/Max are framebuffer occupancy over the window:
  99. // DCGM_FI_DEV_FB_USED / (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE).
  100. // Default dcgm-exporter configuration.
  101. MemoryUsedRatioAvg *float64 `json:"memoryUsedRatioAvg,omitempty"`
  102. MemoryUsedRatioMax *float64 `json:"memoryUsedRatioMax,omitempty"`
  103. // MemoryPressureRatio is the fraction of the window the framebuffer
  104. // occupancy was above the configured threshold (default 0.9).
  105. MemoryPressureRatio *float64 `json:"memoryPressureRatio,omitempty"`
  106. // XIDErrorCount counts distinct XID error events observed in the window
  107. // via changes in DCGM_FI_DEV_XID_ERRORS. That field reports the last
  108. // XID code seen, so repeats of the same error code are undercounted.
  109. XIDErrorCount *float64 `json:"xidErrorCount,omitempty"`
  110. // DRAMActiveAvg/Max are the ratio of cycles the device memory interface
  111. // was active (DCGM_FI_PROF_DRAM_ACTIVE). Requires DCP profiling
  112. // (Volta+). Sustained values near 1.0 with low SMOccupancyAvg indicate
  113. // a memory-bandwidth-bound workload.
  114. DRAMActiveAvg *float64 `json:"dramActiveAvg,omitempty"`
  115. DRAMActiveMax *float64 `json:"dramActiveMax,omitempty"`
  116. // SMActiveAvg (DCGM_FI_PROF_SM_ACTIVE) and SMOccupancyAvg
  117. // (DCGM_FI_PROF_SM_OCCUPANCY) are provided so consumers can
  118. // distinguish compute-bound from bandwidth- or latency-bound
  119. // saturation. Requires DCP profiling and is commented out of the
  120. // default dcgm-exporter configuration.
  121. SMActiveAvg *float64 `json:"smActiveAvg,omitempty"`
  122. SMOccupancyAvg *float64 `json:"smOccupancyAvg,omitempty"`
  123. // PCIe/NVLink average throughput in bytes/sec over the window, from
  124. // rate() of DCGM_FI_PROF_PCIE_TX/RX_BYTES and
  125. // DCGM_FI_PROF_NVLINK_TX/RX_BYTES counters. Requires DCP profiling;
  126. // the NVLink fields are commented out of the default dcgm-exporter
  127. // configuration. Link capacity is not derivable from DCGM, so these
  128. // are raw rates rather than ratios.
  129. PCIeTxBytesAvg *float64 `json:"pcieTxBytesAvg,omitempty"`
  130. PCIeRxBytesAvg *float64 `json:"pcieRxBytesAvg,omitempty"`
  131. NVLinkTxBytesAvg *float64 `json:"nvlinkTxBytesAvg,omitempty"`
  132. NVLinkRxBytesAvg *float64 `json:"nvlinkRxBytesAvg,omitempty"`
  133. }
  134. // scalarFields returns pointers to every scalar field, so SanitizeNaN,
  135. // Equal, and IsEmpty cannot silently miss a newly added field.
  136. func (orig *GPUSaturation) scalarFields() []**float64 {
  137. return []**float64{
  138. &orig.MemoryUsedRatioAvg,
  139. &orig.MemoryUsedRatioMax,
  140. &orig.MemoryPressureRatio,
  141. &orig.XIDErrorCount,
  142. &orig.DRAMActiveAvg,
  143. &orig.DRAMActiveMax,
  144. &orig.SMActiveAvg,
  145. &orig.SMOccupancyAvg,
  146. &orig.PCIeTxBytesAvg,
  147. &orig.PCIeRxBytesAvg,
  148. &orig.NVLinkTxBytesAvg,
  149. &orig.NVLinkRxBytesAvg,
  150. }
  151. }
  152. // SanitizeNaN removes NaN values: NaN scalars become nil and NaN map entries
  153. // are deleted, so absence is always represented the same way.
  154. func (orig *GPUSaturation) SanitizeNaN() {
  155. if orig == nil {
  156. return
  157. }
  158. for _, field := range orig.scalarFields() {
  159. if *field != nil && math.IsNaN(**field) {
  160. *field = nil
  161. }
  162. }
  163. for _, m := range []map[string]float64{orig.ThrottleViolationRatios, orig.ThrottleReasonRatios} {
  164. for k, v := range m {
  165. if math.IsNaN(v) {
  166. delete(m, k)
  167. }
  168. }
  169. }
  170. }
  171. // Clone returns a deep copy of the GPUSaturation.
  172. func (orig *GPUSaturation) Clone() *GPUSaturation {
  173. if orig == nil {
  174. return nil
  175. }
  176. clone := &GPUSaturation{
  177. ThrottleViolationRatios: maps.Clone(orig.ThrottleViolationRatios),
  178. ThrottleReasonRatios: maps.Clone(orig.ThrottleReasonRatios),
  179. }
  180. origFields := orig.scalarFields()
  181. cloneFields := clone.scalarFields()
  182. for i := range origFields {
  183. if *origFields[i] != nil {
  184. v := **origFields[i]
  185. *cloneFields[i] = &v
  186. }
  187. }
  188. return clone
  189. }
  190. // Equal compares two GPUSaturation values field by field. Scalar fields are
  191. // equal when both are nil or both point to the same value.
  192. func (orig *GPUSaturation) Equal(that *GPUSaturation) bool {
  193. if orig == nil && that == nil {
  194. return true
  195. }
  196. if orig == nil || that == nil {
  197. return false
  198. }
  199. if !maps.Equal(orig.ThrottleViolationRatios, that.ThrottleViolationRatios) {
  200. return false
  201. }
  202. if !maps.Equal(orig.ThrottleReasonRatios, that.ThrottleReasonRatios) {
  203. return false
  204. }
  205. origFields := orig.scalarFields()
  206. thatFields := that.scalarFields()
  207. for i := range origFields {
  208. // inline nil/pointee comparison; converge on the shared
  209. // ptrValueEqual helper once the #3846 fix (separate PR) lands
  210. a, b := *origFields[i], *thatFields[i]
  211. if (a == nil) != (b == nil) {
  212. return false
  213. }
  214. if a != nil && *a != *b {
  215. return false
  216. }
  217. }
  218. return true
  219. }
  220. // IsEmpty reports whether no saturation signal is present at all, in which
  221. // case the GPUSaturation should be omitted rather than serialized.
  222. func (orig *GPUSaturation) IsEmpty() bool {
  223. if orig == nil {
  224. return true
  225. }
  226. if len(orig.ThrottleViolationRatios) > 0 || len(orig.ThrottleReasonRatios) > 0 {
  227. return false
  228. }
  229. for _, field := range orig.scalarFields() {
  230. if *field != nil {
  231. return false
  232. }
  233. }
  234. return true
  235. }