dcgm_saturation_test.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. package kubemodel
  2. import (
  3. "math"
  4. "reflect"
  5. "testing"
  6. "time"
  7. )
  // saturationFloat returns a pointer to a fresh copy of v, so that the
  // pointer-typed saturation fields in these tests can be filled from
  // literals. Every call yields a distinct pointer.
  func saturationFloat(v float64) *float64 {
  	p := new(float64)
  	*p = v
  	return p
  }
  11. func mockDCGMDeviceSaturation() *DCGMDeviceSaturation {
  12. return &DCGMDeviceSaturation{
  13. ThrottleViolationRatios: map[string]float64{"power": 0.25},
  14. ThrottleReasonRatios: map[string]float64{"sw_power_cap": 0.2},
  15. MemoryUsedRatioAvg: saturationFloat(0.85),
  16. MemoryUsedRatioMax: saturationFloat(0.99),
  17. MemoryPressureRatio: saturationFloat(0.4),
  18. XIDErrorCount: saturationFloat(2),
  19. DRAMActiveAvg: saturationFloat(0.7),
  20. DRAMActiveMax: saturationFloat(0.95),
  21. SMActiveAvg: saturationFloat(0.6),
  22. SMOccupancyAvg: saturationFloat(0.5),
  23. PCIeTxBytesAvg: saturationFloat(1.5e9),
  24. PCIeRxBytesAvg: saturationFloat(2.5e9),
  25. NVLinkTxBytesAvg: saturationFloat(3.5e9),
  26. NVLinkRxBytesAvg: saturationFloat(4.5e9),
  27. }
  28. }
  29. func mockSaturatedDCGMDevice() *DCGMDevice {
  30. return &DCGMDevice{
  31. UUID: "GPU-1",
  32. Start: time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC),
  33. End: time.Date(2026, 6, 1, 1, 0, 0, 0, time.UTC),
  34. Device: "nvidia0",
  35. ModelName: "NVIDIA A100 80GB",
  36. Saturation: mockDCGMDeviceSaturation(),
  37. PowerWatts: 140,
  38. TemperatureCelsius: 55,
  39. ComputeUtilizationAvg: 42.5,
  40. ComputeUtilizationMax: 97,
  41. MemoryUsedBytesAvg: 32e9,
  42. MemoryUsedBytesMax: 71e9,
  43. }
  44. }
  45. // TestDCGMDevice_DeviceInfoInterface verifies the vendor-neutral identity
  46. // surface on the DCGM device.
  47. func TestDCGMDevice_DeviceInfoInterface(t *testing.T) {
  48. var info DeviceInfo = mockSaturatedDCGMDevice()
  49. if info.GetIdentifier() != "GPU-1" {
  50. t.Errorf("GetIdentifier() = %q, want GPU-1", info.GetIdentifier())
  51. }
  52. if info.GetType() != "GPU" {
  53. t.Errorf("GetType() = %q, want GPU", info.GetType())
  54. }
  55. if info.GetName() != "NVIDIA A100 80GB" {
  56. t.Errorf("GetName() = %q", info.GetName())
  57. }
  58. if !info.GetStart().Before(info.GetEnd()) {
  59. t.Errorf("GetStart() not before GetEnd()")
  60. }
  61. if info.GetPower() != 140 {
  62. t.Errorf("GetPower() = %v, want 140", info.GetPower())
  63. }
  64. // MIG parentage is not derivable from dcgm-exporter labels yet
  65. if info.GetParent() != "" {
  66. t.Errorf("expected empty parent until a mapping source exists, got %q", info.GetParent())
  67. }
  68. }
  69. // TestDCGMDevice_DevicePerformanceInterface verifies the performance surface
  70. // is backed by the device-level metric fields with documented units.
  71. func TestDCGMDevice_DevicePerformanceInterface(t *testing.T) {
  72. var perf DevicePerformance = mockSaturatedDCGMDevice()
  73. if perf.GetComputeUtilizationAverage() != 42.5 || perf.GetComputeUtilizationMax() != 97 {
  74. t.Errorf("compute utilization = (%v, %v), want (42.5, 97)", perf.GetComputeUtilizationAverage(), perf.GetComputeUtilizationMax())
  75. }
  76. if perf.GetMemoryUtilizationAverage() != 32e9 || perf.GetMemoryUtilizationMax() != 71e9 {
  77. t.Errorf("memory utilization = (%v, %v), want (3.2e10, 7.1e10)", perf.GetMemoryUtilizationAverage(), perf.GetMemoryUtilizationMax())
  78. }
  79. if perf.GetTemp() != 55 {
  80. t.Errorf("GetTemp() = %v, want 55", perf.GetTemp())
  81. }
  82. }
  83. // TestDCGMDevice_DeviceSaturationInterface verifies the saturation getters
  84. // expose every signal with correct values, that absence (nil Saturation or
  85. // nil field) reports ok=false / nil rather than zero, and that returned
  86. // maps are copies.
  87. func TestDCGMDevice_DeviceSaturationInterface(t *testing.T) {
  88. var sat DeviceSaturation = mockSaturatedDCGMDevice()
  89. scalarChecks := map[string]struct {
  90. get func() (float64, bool)
  91. want float64
  92. }{
  93. "MemoryUsedRatioAvg": {sat.GetMemoryUsedRatioAvg, 0.85},
  94. "MemoryUsedRatioMax": {sat.GetMemoryUsedRatioMax, 0.99},
  95. "MemoryPressureRatio": {sat.GetMemoryPressureRatio, 0.4},
  96. "ErrorEventCount": {sat.GetErrorEventCount, 2},
  97. "MemoryBandwidthActiveAvg": {sat.GetMemoryBandwidthActiveAvg, 0.7},
  98. "MemoryBandwidthActiveMax": {sat.GetMemoryBandwidthActiveMax, 0.95},
  99. "ComputeActiveAvg": {sat.GetComputeActiveAvg, 0.6},
  100. "ComputeOccupancyAvg": {sat.GetComputeOccupancyAvg, 0.5},
  101. "HostLinkTxBytesAvg": {sat.GetHostLinkTxBytesAvg, 1.5e9},
  102. "HostLinkRxBytesAvg": {sat.GetHostLinkRxBytesAvg, 2.5e9},
  103. "PeerLinkTxBytesAvg": {sat.GetPeerLinkTxBytesAvg, 3.5e9},
  104. "PeerLinkRxBytesAvg": {sat.GetPeerLinkRxBytesAvg, 4.5e9},
  105. }
  106. for name, check := range scalarChecks {
  107. got, ok := check.get()
  108. if !ok || got != check.want {
  109. t.Errorf("%s = (%v, %v), want (%v, true)", name, got, ok, check.want)
  110. }
  111. }
  112. if got := sat.GetThrottleViolationRatios(); got["power"] != 0.25 {
  113. t.Errorf("GetThrottleViolationRatios() = %v, want power: 0.25", got)
  114. }
  115. if got := sat.GetThrottleReasonRatios(); got["sw_power_cap"] != 0.2 {
  116. t.Errorf("GetThrottleReasonRatios() = %v, want sw_power_cap: 0.2", got)
  117. }
  118. // returned maps are copies: consumers must not mutate device state
  119. sat.GetThrottleViolationRatios()["power"] = 0.99
  120. if got := sat.GetThrottleViolationRatios(); got["power"] != 0.25 {
  121. t.Errorf("interface exposed internal throttle map: %v", got)
  122. }
  123. // absence semantics: nil Saturation reports ok=false / nil, never zero
  124. var absent DeviceSaturation = &DCGMDevice{UUID: "GPU-2"}
  125. if _, ok := absent.GetMemoryUsedRatioAvg(); ok {
  126. t.Errorf("expected ok=false for device without saturation")
  127. }
  128. if got := absent.GetThrottleViolationRatios(); got != nil {
  129. t.Errorf("expected nil throttle map for device without saturation, got %v", got)
  130. }
  131. // per-field absence: present Saturation with one nil field
  132. partial := mockSaturatedDCGMDevice()
  133. partial.Saturation.SMActiveAvg = nil
  134. if _, ok := partial.GetComputeActiveAvg(); ok {
  135. t.Errorf("expected ok=false for absent SM active signal")
  136. }
  137. if got, ok := partial.GetComputeOccupancyAvg(); !ok || got != 0.5 {
  138. t.Errorf("sibling signal lost: (%v, %v), want (0.5, true)", got, ok)
  139. }
  140. }
  141. func TestDCGMDeviceSaturation_Validate(t *testing.T) {
  142. cases := map[string]struct {
  143. mutate func(*DCGMDeviceSaturation)
  144. wantErr bool
  145. }{
  146. "valid": {mutate: func(s *DCGMDeviceSaturation) {}, wantErr: false},
  147. "nil ok": {mutate: nil, wantErr: false},
  148. "empty": {mutate: func(s *DCGMDeviceSaturation) { *s = DCGMDeviceSaturation{} }, wantErr: false},
  149. "ratio above one": {
  150. mutate: func(s *DCGMDeviceSaturation) { s.MemoryUsedRatioAvg = saturationFloat(1.1) },
  151. wantErr: true,
  152. },
  153. "negative ratio": {
  154. mutate: func(s *DCGMDeviceSaturation) { s.SMActiveAvg = saturationFloat(-0.1) },
  155. wantErr: true,
  156. },
  157. "NaN ratio": {
  158. mutate: func(s *DCGMDeviceSaturation) { s.DRAMActiveAvg = saturationFloat(math.NaN()) },
  159. wantErr: true,
  160. },
  161. "throttle map ratio above one": {
  162. mutate: func(s *DCGMDeviceSaturation) { s.ThrottleReasonRatios["sw_power_cap"] = 1.5 },
  163. wantErr: true,
  164. },
  165. "negative byte rate": {
  166. mutate: func(s *DCGMDeviceSaturation) { s.PCIeTxBytesAvg = saturationFloat(-1) },
  167. wantErr: true,
  168. },
  169. "negative xid count": {
  170. mutate: func(s *DCGMDeviceSaturation) { s.XIDErrorCount = saturationFloat(-1) },
  171. wantErr: true,
  172. },
  173. }
  174. for name, tc := range cases {
  175. t.Run(name, func(t *testing.T) {
  176. var sat *DCGMDeviceSaturation
  177. if tc.mutate != nil {
  178. sat = mockDCGMDeviceSaturation()
  179. tc.mutate(sat)
  180. }
  181. err := sat.Validate()
  182. if (err != nil) != tc.wantErr {
  183. t.Errorf("Validate() error = %v, wantErr %v", err, tc.wantErr)
  184. }
  185. })
  186. }
  187. // invalid saturation must fail device validation too
  188. device := mockSaturatedDCGMDevice()
  189. device.Saturation.MemoryUsedRatioAvg = saturationFloat(2.0)
  190. window := Window{Start: device.Start, End: device.End}
  191. if err := device.ValidateDCGMDevice(window); err == nil {
  192. t.Errorf("expected ValidateDCGMDevice to reject invalid saturation")
  193. }
  194. }
  195. func TestDCGMDeviceSaturation_Clone(t *testing.T) {
  196. var nilSat *DCGMDeviceSaturation
  197. if nilSat.Clone() != nil {
  198. t.Fatalf("expected nil Clone of nil DCGMDeviceSaturation")
  199. }
  200. orig := mockDCGMDeviceSaturation()
  201. clone := orig.Clone()
  202. if !reflect.DeepEqual(orig, clone) {
  203. t.Fatalf("clone differs from original:\n got %+v\nwant %+v", clone, orig)
  204. }
  205. clone.ThrottleViolationRatios["power"] = 0.99
  206. *clone.MemoryUsedRatioAvg = 0.1
  207. if orig.ThrottleViolationRatios["power"] == 0.99 {
  208. t.Errorf("clone shares ThrottleViolationRatios map with original")
  209. }
  210. if *orig.MemoryUsedRatioAvg == 0.1 {
  211. t.Errorf("clone shares MemoryUsedRatioAvg pointer with original")
  212. }
  213. }
  214. // TestDCGMDevice_BinaryRoundtripWithSaturation verifies a DCGMDevice
  215. // carrying saturation survives the bingen binary codec, and that absent
  216. // saturation stays absent.
  217. func TestDCGMDevice_BinaryRoundtripWithSaturation(t *testing.T) {
  218. cases := map[string]*DCGMDevice{
  219. "nil saturation": {
  220. UUID: "GPU-1",
  221. Device: "nvidia0",
  222. ModelName: "Tesla T4",
  223. },
  224. "populated saturation": mockSaturatedDCGMDevice(),
  225. "empty saturation struct": {
  226. UUID: "GPU-1",
  227. Saturation: &DCGMDeviceSaturation{},
  228. },
  229. }
  230. for name, orig := range cases {
  231. t.Run(name, func(t *testing.T) {
  232. bs, err := orig.MarshalBinary()
  233. if err != nil {
  234. t.Fatalf("MarshalBinary: %s", err)
  235. }
  236. decoded := new(DCGMDevice)
  237. if err := decoded.UnmarshalBinary(bs); err != nil {
  238. t.Fatalf("UnmarshalBinary: %s", err)
  239. }
  240. if decoded.UUID != orig.UUID || decoded.Device != orig.Device || decoded.ModelName != orig.ModelName {
  241. t.Errorf("device identity did not survive roundtrip: got %+v, want %+v", decoded, orig)
  242. }
  243. if decoded.PowerWatts != orig.PowerWatts || decoded.TemperatureCelsius != orig.TemperatureCelsius ||
  244. decoded.ComputeUtilizationAvg != orig.ComputeUtilizationAvg || decoded.ComputeUtilizationMax != orig.ComputeUtilizationMax ||
  245. decoded.MemoryUsedBytesAvg != orig.MemoryUsedBytesAvg || decoded.MemoryUsedBytesMax != orig.MemoryUsedBytesMax {
  246. t.Errorf("device metrics did not survive roundtrip: got %+v, want %+v", decoded, orig)
  247. }
  248. if (decoded.Saturation == nil) != (orig.Saturation == nil) {
  249. t.Fatalf("saturation presence did not survive roundtrip: got %v, want %v", decoded.Saturation, orig.Saturation)
  250. }
  251. if !reflect.DeepEqual(decoded.Saturation, orig.Saturation) {
  252. t.Errorf("saturation did not survive roundtrip: got %+v, want %+v", decoded.Saturation, orig.Saturation)
  253. }
  254. })
  255. }
  256. }