gpusaturation.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. package collector
  2. import (
  3. coreenv "github.com/opencost/opencost/core/pkg/env"
  4. "github.com/opencost/opencost/core/pkg/opencost"
  5. "github.com/opencost/opencost/core/pkg/source"
  6. "github.com/opencost/opencost/modules/collector-source/pkg/metric"
  7. "github.com/opencost/opencost/modules/collector-source/pkg/metric/aggregator"
  8. )
  9. // GPU saturation collectors
  10. //
  11. // These collectors aggregate USE-method GPU saturation signals from
  12. // dcgm-exporter scrapes. When a DCGM field is absent from the scrape (not in
  13. // the exporter config, or no DCP profiling support) the corresponding
  14. // collector simply accumulates nothing and its query returns no results,
  15. // which downstream treats as "signal absent" rather than zero.
  16. // gpuSaturationLabels is the label set shared by every GPU saturation
  17. // collector: container attribution, GPU identity, and MIG instance labels.
  18. // Labels absent from a scrape resolve to empty strings.
  19. var gpuSaturationLabels = []string{
  20. source.NamespaceLabel,
  21. source.PodLabel,
  22. source.PodUIDLabel,
  23. source.ContainerLabel,
  24. source.DeviceLabel,
  25. source.ModelNameLabel,
  26. source.UUIDLabel,
  27. source.MIGProfileLabel,
  28. source.MIGInstanceLabel,
  29. }
  30. func gpuSaturationFilter(labels map[string]string) bool {
  31. return labels[source.ContainerLabel] != ""
  32. }
  33. // gpuThrottleViolationCollectorMetrics maps each violation collector ID to
  34. // its DCGM source metric. The counters accumulate microseconds spent
  35. // throttled; the querier converts the windowed increase into a fraction of
  36. // the window and tags the reason.
  37. var gpuThrottleViolationCollectors = []struct {
  38. ID metric.MetricCollectorID
  39. Metric string
  40. Reason string
  41. }{
  42. {ID: metric.GPUThrottleViolationPowerID, Metric: metric.DCGMFIDEVPOWERVIOLATION, Reason: opencost.GPUThrottleViolationPower},
  43. {ID: metric.GPUThrottleViolationThermalID, Metric: metric.DCGMFIDEVTHERMALVIOLATION, Reason: opencost.GPUThrottleViolationThermal},
  44. {ID: metric.GPUThrottleViolationSyncBoostID, Metric: metric.DCGMFIDEVSYNCBOOSTVIOLATION, Reason: opencost.GPUThrottleViolationSyncBoost},
  45. {ID: metric.GPUThrottleViolationBoardLimitID, Metric: metric.DCGMFIDEVBOARDLIMITVIOLATION, Reason: opencost.GPUThrottleViolationBoardLimit},
  46. }
  47. // gpuThrottleBitmaskMetrics enumerates both names of the DCGM clock throttle
  48. // reasons bitmask field (renamed in DCGM 3.3+); at most one is scraped per
  49. // dcgm-exporter version, so only one family of collectors ever accumulates
  50. // data and the querier's merge of both ID families is effectively a union
  51. // with one empty side. Registering per-name collectors was chosen over
  52. // renaming at scrape time because the TargetScraper is a generic
  53. // name-filtered pipe with no transform hook; if scrape-time normalization
  54. // is ever added, collapse this to the canonical name and halve the
  55. // collectors.
  56. var gpuThrottleBitmaskMetrics = []string{
  57. metric.DCGMFIDEVCLOCKTHROTTLEREASONS,
  58. metric.DCGMFIDEVCLOCKSEVENTREASONS,
  59. }
  60. func newGPUSaturationCollector(id metric.MetricCollectorID, metricName string, factory aggregator.MetricAggregatorFactory) *metric.MetricCollector {
  61. return metric.NewMetricCollector(
  62. id,
  63. metricName,
  64. gpuSaturationLabels,
  65. factory,
  66. gpuSaturationFilter,
  67. )
  68. }
  69. // gpuDeviceLabels groups device-level metrics by device identity (and MIG
  70. // instance) without container attribution: power, temperature, and
  71. // device-level utilization describe the whole device regardless of which
  72. // containers share it.
  73. var gpuDeviceLabels = []string{
  74. source.DeviceLabel,
  75. source.ModelNameLabel,
  76. source.UUIDLabel,
  77. source.MIGProfileLabel,
  78. source.MIGInstanceLabel,
  79. }
  80. func newGPUDeviceCollector(id metric.MetricCollectorID, metricName string, factory aggregator.MetricAggregatorFactory) *metric.MetricCollector {
  81. return metric.NewMetricCollector(id, metricName, gpuDeviceLabels, factory, nil)
  82. }
  83. // NewGPUDeviceMetricCollectors returns the collectors backing the
  84. // DeviceInfo / DevicePerformance contracts: power, temperature,
  85. // device-level compute utilization, and framebuffer used.
  86. func NewGPUDeviceMetricCollectors() []*metric.MetricCollector {
  87. return []*metric.MetricCollector{
  88. newGPUDeviceCollector(metric.GPUDevicePowerAvgID, metric.DCGMFIDEVPOWERUSAGE, aggregator.AverageOverTime),
  89. newGPUDeviceCollector(metric.GPUDeviceTempAvgID, metric.DCGMFIDEVGPUTEMP, aggregator.AverageOverTime),
  90. newGPUDeviceCollector(metric.GPUDeviceUsageAvgID, metric.DCGMFIPROFGRENGINEACTIVE, aggregator.AverageOverTime),
  91. newGPUDeviceCollector(metric.GPUDeviceUsageMaxID, metric.DCGMFIPROFGRENGINEACTIVE, aggregator.MaxOverTime),
  92. newGPUDeviceCollector(metric.GPUDeviceMemoryUsedAvgID, metric.DCGMFIDEVFBUSED, aggregator.AverageOverTime),
  93. newGPUDeviceCollector(metric.GPUDeviceMemoryUsedMaxID, metric.DCGMFIDEVFBUSED, aggregator.MaxOverTime),
  94. }
  95. }
  96. // NewGPUSaturationMetricCollectors returns every collector needed for the
  97. // GPU saturation signals.
  98. func NewGPUSaturationMetricCollectors() []*metric.MetricCollector {
  99. collectors := []*metric.MetricCollector{
  100. // framebuffer occupancy over the synthetic per-sample ratio metric
  101. // joined from FB_USED/FB_FREE at scrape time (see
  102. // synthetic.GPUMemoryUsedRatioSynthesizer)
  103. newGPUSaturationCollector(metric.GPUMemoryUsedAvgID, metric.OpencostGPUMemoryUsedRatio, aggregator.AverageOverTime),
  104. newGPUSaturationCollector(metric.GPUMemoryUsedMaxID, metric.OpencostGPUMemoryUsedRatio, aggregator.MaxOverTime),
  105. newGPUSaturationCollector(metric.GPUMemoryPressureRatioID, metric.OpencostGPUMemoryUsedRatio, aggregator.AboveThresholdRatio(coreenv.GetGPUMemorySaturationThreshold())),
  106. // XID error events: count value transitions of the last-error gauge
  107. newGPUSaturationCollector(metric.GPUXIDErrorCountID, metric.DCGMFIDEVXIDERRORS, aggregator.Changes),
  108. // DCP profiling gauges
  109. newGPUSaturationCollector(metric.GPUDRAMActiveAvgID, metric.DCGMFIPROFDRAMACTIVE, aggregator.AverageOverTime),
  110. newGPUSaturationCollector(metric.GPUDRAMActiveMaxID, metric.DCGMFIPROFDRAMACTIVE, aggregator.MaxOverTime),
  111. newGPUSaturationCollector(metric.GPUSMActiveAvgID, metric.DCGMFIPROFSMACTIVE, aggregator.AverageOverTime),
  112. newGPUSaturationCollector(metric.GPUSMOccupancyAvgID, metric.DCGMFIPROFSMOCCUPANCY, aggregator.AverageOverTime),
  113. // DCP byte counters as average bytes/sec
  114. newGPUSaturationCollector(metric.GPUPCIeTxBytesAvgID, metric.DCGMFIPROFPCIETXBYTES, aggregator.Rate),
  115. newGPUSaturationCollector(metric.GPUPCIeRxBytesAvgID, metric.DCGMFIPROFPCIERXBYTES, aggregator.Rate),
  116. newGPUSaturationCollector(metric.GPUNVLinkTxBytesAvgID, metric.DCGMFIPROFNVLINKTXBYTES, aggregator.Rate),
  117. newGPUSaturationCollector(metric.GPUNVLinkRxBytesAvgID, metric.DCGMFIPROFNVLINKRXBYTES, aggregator.Rate),
  118. }
  119. // throttle violation counters: windowed increase, normalized by the
  120. // querier
  121. for _, violation := range gpuThrottleViolationCollectors {
  122. collectors = append(collectors, newGPUSaturationCollector(violation.ID, violation.Metric, aggregator.Increase))
  123. }
  124. // throttle reasons bitmask: one bit-ratio collector per
  125. // (metric name, saturation-relevant reason)
  126. for _, metricName := range gpuThrottleBitmaskMetrics {
  127. for _, reason := range opencost.GPUThrottleReasons {
  128. collectors = append(collectors, newGPUSaturationCollector(
  129. metric.GPUThrottleReasonCollectorID(metricName, reason.Name),
  130. metricName,
  131. aggregator.BitSetRatio(reason.Bit),
  132. ))
  133. }
  134. }
  135. return collectors
  136. }