gpu_saturation_test.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. package opencost
  2. import (
  3. "math"
  4. "reflect"
  5. "sort"
  6. "testing"
  7. )
  8. func TestGPUThrottleReasonsFromMask(t *testing.T) {
  9. cases := map[string]struct {
  10. mask uint64
  11. want []string
  12. }{
  13. "zero mask": {
  14. mask: 0x0,
  15. want: []string{},
  16. },
  17. "sw power cap": {
  18. mask: GPUThrottleBitSwPowerCap,
  19. want: []string{GPUThrottleReasonSwPowerCap},
  20. },
  21. "hw slowdown": {
  22. mask: GPUThrottleBitHwSlowdown,
  23. want: []string{GPUThrottleReasonHwSlowdown},
  24. },
  25. "sync boost": {
  26. mask: GPUThrottleBitSyncBoost,
  27. want: []string{GPUThrottleReasonSyncBoost},
  28. },
  29. "sw thermal": {
  30. mask: GPUThrottleBitSwThermal,
  31. want: []string{GPUThrottleReasonSwThermal},
  32. },
  33. "hw thermal": {
  34. mask: GPUThrottleBitHwThermal,
  35. want: []string{GPUThrottleReasonHwThermal},
  36. },
  37. "hw power brake": {
  38. mask: GPUThrottleBitHwPowerBrake,
  39. want: []string{GPUThrottleReasonHwPowerBrake},
  40. },
  41. "non-saturation bits ignored": {
  42. // gpu_idle (0x1), applications_clocks_setting (0x2), and
  43. // display_clock_setting (0x100) are operating states, not
  44. // saturation, and must not decode as throttle reasons.
  45. mask: 0x1 | 0x2 | 0x100,
  46. want: []string{},
  47. },
  48. "combined saturation and non-saturation bits": {
  49. mask: 0x1 | GPUThrottleBitSwPowerCap | GPUThrottleBitHwThermal,
  50. want: []string{GPUThrottleReasonSwPowerCap, GPUThrottleReasonHwThermal},
  51. },
  52. "all saturation bits": {
  53. mask: GPUThrottleBitSwPowerCap | GPUThrottleBitHwSlowdown | GPUThrottleBitSyncBoost |
  54. GPUThrottleBitSwThermal | GPUThrottleBitHwThermal | GPUThrottleBitHwPowerBrake,
  55. want: []string{
  56. GPUThrottleReasonSwPowerCap, GPUThrottleReasonHwSlowdown, GPUThrottleReasonSyncBoost,
  57. GPUThrottleReasonSwThermal, GPUThrottleReasonHwThermal, GPUThrottleReasonHwPowerBrake,
  58. },
  59. },
  60. }
  61. for name, tc := range cases {
  62. t.Run(name, func(t *testing.T) {
  63. got := GPUThrottleReasonsFromMask(tc.mask)
  64. sort.Strings(got)
  65. want := append([]string{}, tc.want...)
  66. sort.Strings(want)
  67. if !reflect.DeepEqual(got, want) {
  68. t.Errorf("GPUThrottleReasonsFromMask(%#x) = %v, want %v", tc.mask, got, want)
  69. }
  70. })
  71. }
  72. }
  73. func TestGPUThrottleReasonBitsMatchNVML(t *testing.T) {
  74. // Bit positions are defined by NVML's nvmlClocksThrottleReasons and must
  75. // never drift: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS reports them verbatim.
  76. want := map[string]uint64{
  77. GPUThrottleReasonSwPowerCap: 0x4,
  78. GPUThrottleReasonHwSlowdown: 0x8,
  79. GPUThrottleReasonSyncBoost: 0x10,
  80. GPUThrottleReasonSwThermal: 0x20,
  81. GPUThrottleReasonHwThermal: 0x40,
  82. GPUThrottleReasonHwPowerBrake: 0x80,
  83. }
  84. if len(GPUThrottleReasons) != len(want) {
  85. t.Fatalf("GPUThrottleReasons has %d entries, want %d", len(GPUThrottleReasons), len(want))
  86. }
  87. seen := map[string]bool{}
  88. for _, reason := range GPUThrottleReasons {
  89. bit, ok := want[reason.Name]
  90. if !ok {
  91. t.Errorf("unexpected throttle reason %q", reason.Name)
  92. continue
  93. }
  94. if reason.Bit != bit {
  95. t.Errorf("throttle reason %q has bit %#x, want %#x", reason.Name, reason.Bit, bit)
  96. }
  97. if seen[reason.Name] {
  98. t.Errorf("duplicate throttle reason %q", reason.Name)
  99. }
  100. seen[reason.Name] = true
  101. }
  102. }
  103. func f64(v float64) *float64 {
  104. return &v
  105. }
  106. func mockGPUSaturation() *GPUSaturation {
  107. return &GPUSaturation{
  108. ThrottleViolationRatios: map[string]float64{
  109. GPUThrottleViolationPower: 0.25,
  110. GPUThrottleViolationThermal: 0.1,
  111. },
  112. ThrottleReasonRatios: map[string]float64{
  113. GPUThrottleReasonSwPowerCap: 0.2,
  114. GPUThrottleReasonHwThermal: 0.05,
  115. },
  116. MemoryUsedRatioAvg: f64(0.85),
  117. MemoryUsedRatioMax: f64(0.99),
  118. MemoryPressureRatio: f64(0.4),
  119. XIDErrorCount: f64(2),
  120. DRAMActiveAvg: f64(0.7),
  121. DRAMActiveMax: f64(0.95),
  122. SMActiveAvg: f64(0.6),
  123. SMOccupancyAvg: f64(0.5),
  124. PCIeTxBytesAvg: f64(1.5e9),
  125. PCIeRxBytesAvg: f64(2.5e9),
  126. NVLinkTxBytesAvg: f64(3.5e9),
  127. NVLinkRxBytesAvg: f64(4.5e9),
  128. }
  129. }
  130. func TestGPUSaturation_SanitizeNaN(t *testing.T) {
  131. nan := math.NaN()
  132. sat := mockGPUSaturation()
  133. sat.MemoryUsedRatioAvg = &nan
  134. sat.SMActiveAvg = &nan
  135. sat.ThrottleViolationRatios[GPUThrottleViolationSyncBoost] = math.NaN()
  136. sat.ThrottleReasonRatios[GPUThrottleReasonHwSlowdown] = math.NaN()
  137. sat.SanitizeNaN()
  138. if sat.MemoryUsedRatioAvg != nil {
  139. t.Errorf("expected NaN MemoryUsedRatioAvg to be nil")
  140. }
  141. if sat.SMActiveAvg != nil {
  142. t.Errorf("expected NaN SMActiveAvg to be nil")
  143. }
  144. if _, ok := sat.ThrottleViolationRatios[GPUThrottleViolationSyncBoost]; ok {
  145. t.Errorf("expected NaN throttle violation entry to be removed")
  146. }
  147. if _, ok := sat.ThrottleReasonRatios[GPUThrottleReasonHwSlowdown]; ok {
  148. t.Errorf("expected NaN throttle reason entry to be removed")
  149. }
  150. // non-NaN values survive
  151. if sat.MemoryUsedRatioMax == nil || *sat.MemoryUsedRatioMax != 0.99 {
  152. t.Errorf("expected MemoryUsedRatioMax to survive sanitization")
  153. }
  154. if v, ok := sat.ThrottleViolationRatios[GPUThrottleViolationPower]; !ok || v != 0.25 {
  155. t.Errorf("expected power violation ratio to survive sanitization")
  156. }
  157. // nil receiver must not panic
  158. var nilSat *GPUSaturation
  159. nilSat.SanitizeNaN()
  160. }
  161. func TestGPUSaturation_Clone(t *testing.T) {
  162. var nilSat *GPUSaturation
  163. if nilSat.Clone() != nil {
  164. t.Fatalf("expected nil Clone of nil GPUSaturation")
  165. }
  166. orig := mockGPUSaturation()
  167. clone := orig.Clone()
  168. if !orig.Equal(clone) {
  169. t.Fatalf("expected clone to equal original")
  170. }
  171. // deep copy: mutating the clone must not affect the original
  172. clone.ThrottleViolationRatios[GPUThrottleViolationPower] = 0.99
  173. *clone.MemoryUsedRatioAvg = 0.1
  174. if orig.ThrottleViolationRatios[GPUThrottleViolationPower] == 0.99 {
  175. t.Errorf("clone shares ThrottleViolationRatios map with original")
  176. }
  177. if *orig.MemoryUsedRatioAvg == 0.1 {
  178. t.Errorf("clone shares MemoryUsedRatioAvg pointer with original")
  179. }
  180. }
  181. func TestGPUSaturation_Equal(t *testing.T) {
  182. cases := map[string]struct {
  183. a, b *GPUSaturation
  184. want bool
  185. }{
  186. "both nil": {nil, nil, true},
  187. "one nil": {mockGPUSaturation(), nil, false},
  188. "identical": {mockGPUSaturation(), mockGPUSaturation(), true},
  189. "different scalar": {
  190. mockGPUSaturation(),
  191. func() *GPUSaturation { s := mockGPUSaturation(); s.SMActiveAvg = f64(0.99); return s }(),
  192. false,
  193. },
  194. "nil vs set scalar": {
  195. mockGPUSaturation(),
  196. func() *GPUSaturation { s := mockGPUSaturation(); s.SMActiveAvg = nil; return s }(),
  197. false,
  198. },
  199. "different map value": {
  200. mockGPUSaturation(),
  201. func() *GPUSaturation {
  202. s := mockGPUSaturation()
  203. s.ThrottleReasonRatios[GPUThrottleReasonSwPowerCap] = 0.99
  204. return s
  205. }(),
  206. false,
  207. },
  208. "missing map key": {
  209. mockGPUSaturation(),
  210. func() *GPUSaturation {
  211. s := mockGPUSaturation()
  212. delete(s.ThrottleReasonRatios, GPUThrottleReasonSwPowerCap)
  213. return s
  214. }(),
  215. false,
  216. },
  217. "empty": {&GPUSaturation{}, &GPUSaturation{}, true},
  218. }
  219. for name, tc := range cases {
  220. t.Run(name, func(t *testing.T) {
  221. if got := tc.a.Equal(tc.b); got != tc.want {
  222. t.Errorf("Equal() = %v, want %v", got, tc.want)
  223. }
  224. if got := tc.b.Equal(tc.a); got != tc.want {
  225. t.Errorf("Equal() reversed = %v, want %v", got, tc.want)
  226. }
  227. })
  228. }
  229. }
  230. func TestGPUSaturation_IsEmpty(t *testing.T) {
  231. var nilSat *GPUSaturation
  232. if !nilSat.IsEmpty() {
  233. t.Errorf("expected nil GPUSaturation to be empty")
  234. }
  235. if !(&GPUSaturation{}).IsEmpty() {
  236. t.Errorf("expected zero GPUSaturation to be empty")
  237. }
  238. if !(&GPUSaturation{ThrottleReasonRatios: map[string]float64{}}).IsEmpty() {
  239. t.Errorf("expected GPUSaturation with empty map to be empty")
  240. }
  241. if mockGPUSaturation().IsEmpty() {
  242. t.Errorf("expected populated GPUSaturation to be non-empty")
  243. }
  244. if (&GPUSaturation{XIDErrorCount: f64(1)}).IsEmpty() {
  245. t.Errorf("expected GPUSaturation with one field to be non-empty")
  246. }
  247. }
  248. // TestGPUAllocation_BinaryRoundtripWithSaturation verifies that a
  249. // GPUAllocation carrying saturation data survives bingen binary
  250. // marshal/unmarshal, and that absent saturation stays absent.
  251. func TestGPUAllocation_BinaryRoundtripWithSaturation(t *testing.T) {
  252. shared := false
  253. cases := map[string]*GPUAllocation{
  254. "nil saturation": {
  255. GPUDevice: "nvidia0",
  256. GPUModel: "Tesla T4",
  257. GPUUUID: "GPU-1",
  258. IsGPUShared: &shared,
  259. GPUUsageAverage: f64(0.5),
  260. },
  261. "populated saturation": {
  262. GPUDevice: "nvidia0",
  263. GPUModel: "Tesla T4",
  264. GPUUUID: "GPU-1",
  265. Saturation: mockGPUSaturation(),
  266. },
  267. "empty saturation struct": {
  268. GPUDevice: "nvidia0",
  269. Saturation: &GPUSaturation{},
  270. },
  271. }
  272. for name, orig := range cases {
  273. t.Run(name, func(t *testing.T) {
  274. bs, err := orig.MarshalBinary()
  275. if err != nil {
  276. t.Fatalf("MarshalBinary: %s", err)
  277. }
  278. decoded := new(GPUAllocation)
  279. if err := decoded.UnmarshalBinary(bs); err != nil {
  280. t.Fatalf("UnmarshalBinary: %s", err)
  281. }
  282. if decoded.GPUDevice != orig.GPUDevice || decoded.GPUModel != orig.GPUModel || decoded.GPUUUID != orig.GPUUUID {
  283. t.Errorf("device identity did not survive roundtrip: got %+v, want %+v", decoded, orig)
  284. }
  285. if (decoded.Saturation == nil) != (orig.Saturation == nil) {
  286. t.Fatalf("saturation presence did not survive roundtrip: got %v, want %v", decoded.Saturation, orig.Saturation)
  287. }
  288. if !decoded.Saturation.Equal(orig.Saturation) {
  289. t.Errorf("saturation did not survive roundtrip: got %+v, want %+v", decoded.Saturation, orig.Saturation)
  290. }
  291. })
  292. }
  293. }