dcgm_saturation_test.go 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. package kubemodel
  2. import (
  3. "testing"
  4. "github.com/opencost/opencost/core/pkg/model/kubemodel"
  5. "github.com/opencost/opencost/core/pkg/source"
  6. "github.com/opencost/opencost/core/pkg/util"
  7. )
  8. func saturationDeviceMap() map[string]*kubemodel.DCGMDevice {
  9. return map[string]*kubemodel.DCGMDevice{
  10. "GPU-1": {UUID: "GPU-1", Device: "nvidia0", ModelName: "Tesla T4"},
  11. "GPU-2": {UUID: "GPU-2", Device: "nvidia1", ModelName: "Tesla T4"},
  12. }
  13. }
  14. func deviceSaturationResult(uuid, container, reason string, value float64) *source.GPUSaturationResult {
  15. return &source.GPUSaturationResult{
  16. UUID: uuid,
  17. Container: container,
  18. Pod: "pod-" + container,
  19. Namespace: "ns",
  20. Reason: reason,
  21. Data: []*util.Vector{{Value: value}},
  22. }
  23. }
  24. func TestApplyDeviceSaturationScalar(t *testing.T) {
  25. deviceMap := saturationDeviceMap()
  26. results := []*source.GPUSaturationResult{
  27. // dcgm-exporter duplicates the device-level value onto every
  28. // container sharing the device: both rows carry the same value
  29. deviceSaturationResult("GPU-1", "container-a", "", 0.85),
  30. deviceSaturationResult("GPU-1", "container-b", "", 0.85),
  31. // unknown device must be skipped
  32. deviceSaturationResult("GPU-9", "container-c", "", 0.5),
  33. }
  34. noData := deviceSaturationResult("GPU-2", "container-d", "", 0)
  35. noData.Data = nil
  36. results = append(results, noData)
  37. applyDeviceSaturationScalar(deviceMap, results, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.MemoryUsedRatioAvg = &v })
  38. sat := deviceMap["GPU-1"].Saturation
  39. if sat == nil || sat.MemoryUsedRatioAvg == nil || *sat.MemoryUsedRatioAvg != 0.85 {
  40. t.Errorf("GPU-1 saturation = %+v, want MemoryUsedRatioAvg 0.85", sat)
  41. }
  42. // device with only an empty-data result must keep Saturation nil
  43. if deviceMap["GPU-2"].Saturation != nil {
  44. t.Errorf("GPU-2 saturation should stay nil for empty results, got %+v", deviceMap["GPU-2"].Saturation)
  45. }
  46. }
  47. func TestApplyDeviceThrottleRatios(t *testing.T) {
  48. deviceMap := saturationDeviceMap()
  49. applyDeviceThrottleRatios(deviceMap, []*source.GPUSaturationResult{
  50. deviceSaturationResult("GPU-1", "container-a", "power", 0.25),
  51. deviceSaturationResult("GPU-1", "container-a", "thermal", 0.1),
  52. // duplicate attribution from a sharing container, same value
  53. deviceSaturationResult("GPU-1", "container-b", "power", 0.25),
  54. // missing reason must be dropped without creating saturation
  55. deviceSaturationResult("GPU-2", "container-c", "", 0.5),
  56. }, func(sat *kubemodel.DCGMDeviceSaturation) map[string]float64 {
  57. if sat.ThrottleViolationRatios == nil {
  58. sat.ThrottleViolationRatios = make(map[string]float64)
  59. }
  60. return sat.ThrottleViolationRatios
  61. })
  62. sat := deviceMap["GPU-1"].Saturation
  63. if sat == nil {
  64. t.Fatalf("expected saturation on GPU-1")
  65. }
  66. if len(sat.ThrottleViolationRatios) != 2 || sat.ThrottleViolationRatios["power"] != 0.25 || sat.ThrottleViolationRatios["thermal"] != 0.1 {
  67. t.Errorf("ThrottleViolationRatios = %v", sat.ThrottleViolationRatios)
  68. }
  69. if deviceMap["GPU-2"].Saturation != nil {
  70. t.Errorf("reasonless result must not create saturation, got %+v", deviceMap["GPU-2"].Saturation)
  71. }
  72. }
  73. // TestDCGMSaturationAwaitAndApply runs the full reduction over a fabricated
  74. // future bundle, verifying every signal lands on the right device and that
  75. // devices without signals keep Saturation nil.
  76. func TestDCGMSaturationAwaitAndApply(t *testing.T) {
  77. makeFuture := func(results ...*source.GPUSaturationResult) *source.QueryGroupFuture[source.GPUSaturationResult] {
  78. queryResults := source.NewQueryResults("test")
  79. for _, res := range results {
  80. metrics := map[string]any{
  81. "UUID": res.UUID,
  82. "container": res.Container,
  83. "pod": res.Pod,
  84. "namespace": res.Namespace,
  85. }
  86. if res.Reason != "" {
  87. metrics["reason"] = res.Reason
  88. }
  89. queryResults.Results = append(queryResults.Results, source.NewQueryResult(metrics, res.Data, nil))
  90. }
  91. ch := make(source.QueryResultsChan, 1)
  92. ch <- queryResults
  93. grp := source.NewQueryGroup()
  94. return source.WithGroup(grp, source.NewFuture(source.DecodeGPUSaturationResult, ch))
  95. }
  96. deviceMap := saturationDeviceMap()
  97. futures := &dcgmSaturationFutures{
  98. throttleViolation: makeFuture(deviceSaturationResult("GPU-1", "c", "power", 0.25)),
  99. throttleReason: makeFuture(deviceSaturationResult("GPU-1", "c", "sw_power_cap", 0.2)),
  100. memoryUsedAvg: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.85)),
  101. memoryUsedMax: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.99)),
  102. memoryPressure: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.4)),
  103. xidErrorCount: makeFuture(deviceSaturationResult("GPU-1", "c", "", 2)),
  104. dramActiveAvg: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.7)),
  105. dramActiveMax: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.95)),
  106. smActiveAvg: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.6)),
  107. smOccupancyAvg: makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.5)),
  108. pcieTxBytesAvg: makeFuture(deviceSaturationResult("GPU-1", "c", "", 1.5e9)),
  109. pcieRxBytesAvg: makeFuture(deviceSaturationResult("GPU-1", "c", "", 2.5e9)),
  110. nvlinkTxBytesAvg: makeFuture(),
  111. nvlinkRxBytesAvg: makeFuture(),
  112. }
  113. futures.awaitAndApply(deviceMap)
  114. sat := deviceMap["GPU-1"].Saturation
  115. if sat == nil {
  116. t.Fatalf("expected saturation on GPU-1")
  117. }
  118. if err := sat.Validate(); err != nil {
  119. t.Fatalf("hydrated saturation invalid: %v", err)
  120. }
  121. if sat.ThrottleViolationRatios["power"] != 0.25 || sat.ThrottleReasonRatios["sw_power_cap"] != 0.2 {
  122. t.Errorf("throttle ratios = %v / %v", sat.ThrottleViolationRatios, sat.ThrottleReasonRatios)
  123. }
  124. scalarChecks := map[string]*float64{
  125. "MemoryUsedRatioAvg=0.85": sat.MemoryUsedRatioAvg,
  126. "MemoryUsedRatioMax=0.99": sat.MemoryUsedRatioMax,
  127. "MemoryPressureRatio=0.4": sat.MemoryPressureRatio,
  128. "XIDErrorCount=2": sat.XIDErrorCount,
  129. "DRAMActiveAvg=0.7": sat.DRAMActiveAvg,
  130. "DRAMActiveMax=0.95": sat.DRAMActiveMax,
  131. "SMActiveAvg=0.6": sat.SMActiveAvg,
  132. "SMOccupancyAvg=0.5": sat.SMOccupancyAvg,
  133. "PCIeTxBytesAvg=1.5e9": sat.PCIeTxBytesAvg,
  134. "PCIeRxBytesAvg=2.5e9": sat.PCIeRxBytesAvg,
  135. }
  136. for name, ptr := range scalarChecks {
  137. if ptr == nil {
  138. t.Errorf("%s: signal missing", name)
  139. }
  140. }
  141. // NVLink queries returned nothing: absent, not zero
  142. if sat.NVLinkTxBytesAvg != nil || sat.NVLinkRxBytesAvg != nil {
  143. t.Errorf("expected NVLink signals to stay nil, got %v / %v", sat.NVLinkTxBytesAvg, sat.NVLinkRxBytesAvg)
  144. }
  145. // no signals targeted GPU-2: Saturation stays nil
  146. if deviceMap["GPU-2"].Saturation != nil {
  147. t.Errorf("GPU-2 saturation should be nil, got %+v", deviceMap["GPU-2"].Saturation)
  148. }
  149. // nil bundle (feature disabled) must be a no-op
  150. var disabled *dcgmSaturationFutures
  151. disabled.awaitAndApply(deviceMap)
  152. }
  153. // TestDCGMDeviceMetricAwaitAndApply verifies device-level metrics land with
  154. // correct unit scaling and that devices without series keep zero values.
  155. func TestDCGMDeviceMetricAwaitAndApply(t *testing.T) {
  156. makeFuture := func(results ...*source.GPUDeviceMetricResult) *source.QueryGroupFuture[source.GPUDeviceMetricResult] {
  157. queryResults := source.NewQueryResults("test")
  158. for _, res := range results {
  159. queryResults.Results = append(queryResults.Results, source.NewQueryResult(map[string]any{
  160. "UUID": res.UUID,
  161. }, res.Data, nil))
  162. }
  163. ch := make(source.QueryResultsChan, 1)
  164. ch <- queryResults
  165. grp := source.NewQueryGroup()
  166. return source.WithGroup(grp, source.NewFuture(source.DecodeGPUDeviceMetricResult, ch))
  167. }
  168. metricResult := func(uuid string, value float64) *source.GPUDeviceMetricResult {
  169. return &source.GPUDeviceMetricResult{UUID: uuid, Data: []*util.Vector{{Value: value}}}
  170. }
  171. deviceMap := saturationDeviceMap()
  172. futures := &dcgmDeviceMetricFutures{
  173. powerAvg: makeFuture(metricResult("GPU-1", 140)),
  174. tempAvg: makeFuture(metricResult("GPU-1", 55)),
  175. usageAvg: makeFuture(metricResult("GPU-1", 0.425)),
  176. usageMax: makeFuture(metricResult("GPU-1", 0.97)),
  177. memoryUsedAvg: makeFuture(metricResult("GPU-1", 1024)), // MiB
  178. memoryUsedMax: makeFuture(metricResult("GPU-1", 2048)),
  179. }
  180. futures.awaitAndApply(deviceMap)
  181. d := deviceMap["GPU-1"]
  182. if d.PowerWatts != 140 || d.TemperatureCelsius != 55 {
  183. t.Errorf("power/temp = (%v, %v), want (140, 55)", d.PowerWatts, d.TemperatureCelsius)
  184. }
  185. // GR_ENGINE_ACTIVE ratio scaled to percent
  186. if d.ComputeUtilizationAvg != 42.5 || d.ComputeUtilizationMax != 97 {
  187. t.Errorf("compute util = (%v, %v), want (42.5, 97)", d.ComputeUtilizationAvg, d.ComputeUtilizationMax)
  188. }
  189. // FB_USED MiB scaled to bytes
  190. if d.MemoryUsedBytesAvg != 1024*1024*1024 || d.MemoryUsedBytesMax != 2048*1024*1024 {
  191. t.Errorf("memory bytes = (%v, %v)", d.MemoryUsedBytesAvg, d.MemoryUsedBytesMax)
  192. }
  193. // no series targeted GPU-2: untouched zeros
  194. if deviceMap["GPU-2"].PowerWatts != 0 || deviceMap["GPU-2"].ComputeUtilizationAvg != 0 {
  195. t.Errorf("GPU-2 must stay zero: %+v", deviceMap["GPU-2"])
  196. }
  197. // nil bundle is a no-op
  198. var disabled *dcgmDeviceMetricFutures
  199. disabled.awaitAndApply(deviceMap)
  200. }