gpusaturationquerier_test.go 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. package collector
  2. import (
  3. "math"
  4. "testing"
  5. "time"
  6. "github.com/opencost/opencost/core/pkg/opencost"
  7. "github.com/opencost/opencost/core/pkg/source"
  8. "github.com/opencost/opencost/modules/collector-source/pkg/metric"
  9. )
  10. // gpuSaturationMockProvider builds a store with one hour of DCGM saturation
  11. // samples for a single GPU container. SM_ACTIVE, SM_OCCUPANCY, and NVLink
  12. // metrics are deliberately never updated to exercise absent-signal behavior.
  13. func gpuSaturationMockProvider(t *testing.T) (StoreProvider, time.Time, time.Time) {
  14. t.Helper()
  15. t.Setenv("GPU_MEMORY_SATURATION_THRESHOLD", "0.6")
  16. start, _ := time.Parse(time.RFC3339, Start1Str)
  17. end, _ := time.Parse(time.RFC3339, End1Str)
  18. gpuInfo := map[string]string{
  19. source.NamespaceLabel: "namespace1",
  20. source.PodLabel: "pod1",
  21. source.PodUIDLabel: "pod-uuid1",
  22. source.ContainerLabel: "container1",
  23. source.DeviceLabel: "nvidia0",
  24. source.ModelNameLabel: "Tesla T4",
  25. source.UUIDLabel: "GPU-1",
  26. source.MIGProfileLabel: "",
  27. source.MIGInstanceLabel: "",
  28. }
  29. store := NewOpenCostMetricStore()
  30. // power violation counter: 1.8e9us accumulated over a 3.6e9us window
  31. store.Update(metric.DCGMFIDEVPOWERVIOLATION, gpuInfo, 0, start, nil)
  32. store.Update(metric.DCGMFIDEVPOWERVIOLATION, gpuInfo, 1.8e9, end, nil)
  33. // throttle bitmask (legacy field name): sw_power_cap set in 1 of 2 samples
  34. store.Update(metric.DCGMFIDEVCLOCKTHROTTLEREASONS, gpuInfo, 0x4, start, nil)
  35. store.Update(metric.DCGMFIDEVCLOCKTHROTTLEREASONS, gpuInfo, 0x0, end, nil)
  36. // framebuffer occupancy ratio, as synthesized from FB_USED/FB_FREE per
  37. // scrape by GPUMemoryUsedRatioSynthesizer (see synthetic package tests
  38. // for the join itself): avg 0.625, max 0.75, half of samples >= 0.6
  39. store.Update(metric.OpencostGPUMemoryUsedRatio, gpuInfo, 0.5, start, nil)
  40. store.Update(metric.OpencostGPUMemoryUsedRatio, gpuInfo, 0.75, end, nil)
  41. // one XID error transition
  42. store.Update(metric.DCGMFIDEVXIDERRORS, gpuInfo, 0, start, nil)
  43. store.Update(metric.DCGMFIDEVXIDERRORS, gpuInfo, 13, end, nil)
  44. // DRAM activity gauge
  45. store.Update(metric.DCGMFIPROFDRAMACTIVE, gpuInfo, 0.5, start, nil)
  46. store.Update(metric.DCGMFIPROFDRAMACTIVE, gpuInfo, 0.7, end, nil)
  47. // PCIe tx counter: 3.6e12 bytes over 3600s = 1e9 bytes/sec
  48. store.Update(metric.DCGMFIPROFPCIETXBYTES, gpuInfo, 0, start, nil)
  49. store.Update(metric.DCGMFIPROFPCIETXBYTES, gpuInfo, 3.6e12, end, nil)
  50. return &MockStoreProvider{metricsCollector: store}, start, end
  51. }
  52. func awaitGPUSaturation(t *testing.T, f *source.Future[source.GPUSaturationResult]) []*source.GPUSaturationResult {
  53. t.Helper()
  54. res, err := f.Await()
  55. if err != nil {
  56. t.Fatalf("unexpected error: %v", err)
  57. }
  58. return res
  59. }
  60. func requireValue(t *testing.T, results []*source.GPUSaturationResult, want float64) {
  61. t.Helper()
  62. if len(results) != 1 {
  63. t.Fatalf("expected 1 result, got %d", len(results))
  64. }
  65. got := results[0].Data[0].Value
  66. if math.Abs(got-want) > 1e-9 {
  67. t.Errorf("value = %v, want %v", got, want)
  68. }
  69. if results[0].UUID != "GPU-1" || results[0].Container != "container1" {
  70. t.Errorf("result lost GPU identity labels: %+v", results[0])
  71. }
  72. }
  73. func TestCollectorMetricsQuerier_GPUThrottleViolationRatio(t *testing.T) {
  74. provider, start, end := gpuSaturationMockProvider(t)
  75. c := collectorMetricsQuerier{collectorProvider: provider}
  76. results := awaitGPUSaturation(t, c.QueryGPUThrottleViolationRatio(start, end))
  77. // only the power violation counter was scraped
  78. requireValue(t, results, 0.5)
  79. if results[0].Reason != opencost.GPUThrottleViolationPower {
  80. t.Errorf("Reason = %q, want %q", results[0].Reason, opencost.GPUThrottleViolationPower)
  81. }
  82. }
  83. func TestCollectorMetricsQuerier_GPUThrottleReasonRatio(t *testing.T) {
  84. provider, start, end := gpuSaturationMockProvider(t)
  85. c := collectorMetricsQuerier{collectorProvider: provider}
  86. results := awaitGPUSaturation(t, c.QueryGPUThrottleReasonRatio(start, end))
  87. // the legacy bitmask field was scraped, so every reason bit reports
  88. got := map[string]float64{}
  89. for _, res := range results {
  90. got[res.Reason] = res.Data[0].Value
  91. }
  92. if len(got) != len(opencost.GPUThrottleReasons) {
  93. t.Fatalf("expected %d reasons, got %d: %v", len(opencost.GPUThrottleReasons), len(got), got)
  94. }
  95. for _, reason := range opencost.GPUThrottleReasons {
  96. want := 0.0
  97. if reason.Name == opencost.GPUThrottleReasonSwPowerCap {
  98. want = 0.5
  99. }
  100. if math.Abs(got[reason.Name]-want) > 1e-9 {
  101. t.Errorf("reason %q ratio = %v, want %v", reason.Name, got[reason.Name], want)
  102. }
  103. }
  104. }
  105. func TestCollectorMetricsQuerier_GPUMemoryUsedRatio(t *testing.T) {
  106. provider, start, end := gpuSaturationMockProvider(t)
  107. c := collectorMetricsQuerier{collectorProvider: provider}
  108. // avg of per-sample ratios (0.5, 0.75)
  109. requireValue(t, awaitGPUSaturation(t, c.QueryGPUMemoryUsedRatioAvg(start, end)), 0.625)
  110. // max of per-sample ratios
  111. requireValue(t, awaitGPUSaturation(t, c.QueryGPUMemoryUsedRatioMax(start, end)), 0.75)
  112. }
  113. func TestCollectorMetricsQuerier_GPUMemoryPressureRatio(t *testing.T) {
  114. provider, start, end := gpuSaturationMockProvider(t)
  115. c := collectorMetricsQuerier{collectorProvider: provider}
  116. // threshold configured to 0.6: one of two samples (0.75) is at or above
  117. requireValue(t, awaitGPUSaturation(t, c.QueryGPUMemoryPressureRatio(start, end)), 0.5)
  118. }
  119. func TestCollectorMetricsQuerier_GPUXIDErrorCount(t *testing.T) {
  120. provider, start, end := gpuSaturationMockProvider(t)
  121. c := collectorMetricsQuerier{collectorProvider: provider}
  122. requireValue(t, awaitGPUSaturation(t, c.QueryGPUXIDErrorCount(start, end)), 1)
  123. }
  124. func TestCollectorMetricsQuerier_GPUDRAMActive(t *testing.T) {
  125. provider, start, end := gpuSaturationMockProvider(t)
  126. c := collectorMetricsQuerier{collectorProvider: provider}
  127. requireValue(t, awaitGPUSaturation(t, c.QueryGPUDRAMActiveAvg(start, end)), 0.6)
  128. requireValue(t, awaitGPUSaturation(t, c.QueryGPUDRAMActiveMax(start, end)), 0.7)
  129. }
  130. func TestCollectorMetricsQuerier_GPUPCIeTxBytesAvg(t *testing.T) {
  131. provider, start, end := gpuSaturationMockProvider(t)
  132. c := collectorMetricsQuerier{collectorProvider: provider}
  133. requireValue(t, awaitGPUSaturation(t, c.QueryGPUPCIeTxBytesAvg(start, end)), 1e9)
  134. }
  135. // TestCollectorMetricsQuerier_GPUDeviceMetrics verifies the device-level
  136. // queries aggregate from the device-labeled DCGM series.
  137. func TestCollectorMetricsQuerier_GPUDeviceMetrics(t *testing.T) {
  138. start, _ := time.Parse(time.RFC3339, Start1Str)
  139. end, _ := time.Parse(time.RFC3339, End1Str)
  140. deviceInfo := map[string]string{
  141. source.DeviceLabel: "nvidia0",
  142. source.ModelNameLabel: "Tesla T4",
  143. source.UUIDLabel: "GPU-1",
  144. source.MIGProfileLabel: "",
  145. source.MIGInstanceLabel: "",
  146. }
  147. store := NewOpenCostMetricStore()
  148. store.Update(metric.DCGMFIDEVPOWERUSAGE, deviceInfo, 120, start, nil)
  149. store.Update(metric.DCGMFIDEVPOWERUSAGE, deviceInfo, 160, end, nil)
  150. store.Update(metric.DCGMFIDEVGPUTEMP, deviceInfo, 55, start, nil)
  151. store.Update(metric.DCGMFIPROFGRENGINEACTIVE, deviceInfo, 0.4, start, nil)
  152. store.Update(metric.DCGMFIPROFGRENGINEACTIVE, deviceInfo, 0.9, end, nil)
  153. store.Update(metric.DCGMFIDEVFBUSED, deviceInfo, 1024, start, nil)
  154. store.Update(metric.DCGMFIDEVFBUSED, deviceInfo, 2048, end, nil)
  155. c := collectorMetricsQuerier{collectorProvider: &MockStoreProvider{metricsCollector: store}}
  156. checks := map[string]struct {
  157. future *source.Future[source.GPUDeviceMetricResult]
  158. want float64
  159. }{
  160. "power avg": {c.QueryGPUDevicePowerAvg(start, end), 140},
  161. "temp avg": {c.QueryGPUDeviceTempAvg(start, end), 55},
  162. "usage avg": {c.QueryGPUDeviceUsageAvg(start, end), 0.65},
  163. "usage max": {c.QueryGPUDeviceUsageMax(start, end), 0.9},
  164. "memory avg": {c.QueryGPUDeviceMemoryUsedAvg(start, end), 1536},
  165. "memory max": {c.QueryGPUDeviceMemoryUsedMax(start, end), 2048},
  166. }
  167. for name, check := range checks {
  168. results := awaitGPUSaturation(t, check.future)
  169. if len(results) != 1 {
  170. t.Fatalf("%s: expected 1 result, got %d", name, len(results))
  171. }
  172. if got := results[0].Data[0].Value; math.Abs(got-check.want) > 1e-9 {
  173. t.Errorf("%s = %v, want %v", name, got, check.want)
  174. }
  175. if results[0].UUID != "GPU-1" {
  176. t.Errorf("%s: lost device identity: %+v", name, results[0])
  177. }
  178. }
  179. }
  180. // TestCollectorMetricsQuerier_GPUSaturationAbsentSignals verifies that
  181. // signals whose DCGM fields were never scraped return no results instead of
  182. // zeroes.
  183. func TestCollectorMetricsQuerier_GPUSaturationAbsentSignals(t *testing.T) {
  184. provider, start, end := gpuSaturationMockProvider(t)
  185. c := collectorMetricsQuerier{collectorProvider: provider}
  186. absent := map[string]*source.Future[source.GPUSaturationResult]{
  187. "SMActiveAvg": c.QueryGPUSMActiveAvg(start, end),
  188. "SMOccupancyAvg": c.QueryGPUSMOccupancyAvg(start, end),
  189. "PCIeRxBytesAvg": c.QueryGPUPCIeRxBytesAvg(start, end),
  190. "NVLinkTxBytesAvg": c.QueryGPUNVLinkTxBytesAvg(start, end),
  191. "NVLinkRxBytesAvg": c.QueryGPUNVLinkRxBytesAvg(start, end),
  192. }
  193. for name, future := range absent {
  194. if results := awaitGPUSaturation(t, future); len(results) != 0 {
  195. t.Errorf("%s: expected no results for unscraped metric, got %d", name, len(results))
  196. }
  197. }
  198. }