gpusaturationquerier.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. package prom
  2. import (
  3. "fmt"
  4. "strings"
  5. "time"
  6. "github.com/opencost/opencost/core/pkg/log"
  7. "github.com/opencost/opencost/core/pkg/opencost"
  8. "github.com/opencost/opencost/core/pkg/source"
  9. "github.com/opencost/opencost/core/pkg/util/timeutil"
  10. )
  11. // GPU saturation queries
  12. //
  13. // These queries derive USE-method saturation signals for GPUs from
  14. // dcgm-exporter metrics. Each query is an independent primitive; when the
  15. // underlying DCGM field is not collected (no dcgm-exporter, the field is
  16. // disabled in its config, or the GPU lacks DCP profiling support) the query
  17. // simply returns no series, which downstream code treats as "signal absent"
  18. // rather than zero.
  19. //
  20. // Sources by dcgm-exporter configuration:
  21. // - default config: DCGM_FI_DEV_*_VIOLATION, DCGM_FI_DEV_FB_USED/FREE,
  22. // DCGM_FI_DEV_XID_ERRORS, DCGM_FI_PROF_DRAM_ACTIVE,
  23. // DCGM_FI_PROF_PCIE_TX/RX_BYTES
  24. // - requires explicit enablement: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS
  25. // (DCGM_FI_DEV_CLOCKS_EVENT_REASONS in DCGM 3.3+),
  26. // DCGM_FI_PROF_SM_ACTIVE, DCGM_FI_PROF_SM_OCCUPANCY,
  27. // DCGM_FI_PROF_NVLINK_TX/RX_BYTES
  28. //
  29. // All DCGM_FI_PROF_* fields additionally require DCP profiling (Volta+).
  30. // gpuSaturationByLabels is the grouping shared by every GPU saturation
  31. // query: container attribution, GPU identity, and MIG instance labels.
  32. // Grouping by a label that is absent from a series yields an empty value,
  33. // so non-MIG series pass through unchanged.
  34. const gpuSaturationByLabels = `container, pod, namespace, device, modelName, UUID, GPU_I_PROFILE, GPU_I_ID, pod_uid`
  35. // gpuThrottleViolationMetrics maps each DCGM violation counter to its
  36. // canonical reason name. The counters accumulate microseconds spent
  37. // throttled, so increase(counter[window]) / window-in-microseconds is the
  38. // fraction of the window spent throttled for that reason.
  39. var gpuThrottleViolationMetrics = []struct {
  40. Metric string
  41. Reason string
  42. }{
  43. {Metric: "DCGM_FI_DEV_POWER_VIOLATION", Reason: opencost.GPUThrottleViolationPower},
  44. {Metric: "DCGM_FI_DEV_THERMAL_VIOLATION", Reason: opencost.GPUThrottleViolationThermal},
  45. {Metric: "DCGM_FI_DEV_SYNC_BOOST_VIOLATION", Reason: opencost.GPUThrottleViolationSyncBoost},
  46. {Metric: "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION", Reason: opencost.GPUThrottleViolationBoardLimit},
  47. }
  48. // buildGPUThrottleViolationQuery returns one query producing a series per
  49. // (GPU, container, reason): the fraction of the window each violation
  50. // counter accumulated. Branches for the four violation metrics are joined
  51. // with "or" and tagged with a constant "reason" label so a single decoder
  52. // handles all of them.
  53. func buildGPUThrottleViolationQuery(clusterFilter, durStr, clusterLabel string, windowSeconds float64) string {
  54. windowMicros := windowSeconds * 1e6
  55. branches := make([]string, 0, len(gpuThrottleViolationMetrics))
  56. for _, violation := range gpuThrottleViolationMetrics {
  57. branches = append(branches, fmt.Sprintf(
  58. `label_replace(avg(increase(%s{container!="",%s}[%s])) by (%s, %s) / %g, "reason", "%s", "", "")`,
  59. violation.Metric, clusterFilter, durStr, gpuSaturationByLabels, clusterLabel, windowMicros, violation.Reason,
  60. ))
  61. }
  62. return strings.Join(branches, " or ")
  63. }
  64. // buildGPUThrottleReasonQuery returns one query producing a series per
  65. // (GPU, container, reason): the fraction of window samples in which the
  66. // reason bit was set in the clock throttle reasons bitmask. The bit test
  67. // floor(mask / bit) % 2 is evaluated per sample via a subquery at the
  68. // configured resolution (PromQL has no bitwise operators, hence the
  69. // arithmetic). Both the pre-3.3 and post-3.3 DCGM field names are queried
  70. // via `or`; at most one exists per dcgm-exporter version. The rename is
  71. // handled here at the query layer because the prometheus source has no
  72. // ingest step where the metric name could be normalized once — Prometheus
  73. // stores whatever dcgm-exporter exposed. If a normalization layer ever
  74. // exists, collapse these to the canonical name there.
  75. func buildGPUThrottleReasonQuery(clusterFilter, durStr, clusterLabel string, minsPerResolution int) string {
  76. branches := make([]string, 0, len(opencost.GPUThrottleReasons))
  77. for _, reason := range opencost.GPUThrottleReasons {
  78. branches = append(branches, fmt.Sprintf(
  79. `label_replace(avg(avg_over_time(((floor((DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{container!="",%s} or DCGM_FI_DEV_CLOCKS_EVENT_REASONS{container!="",%s}) / %d)) %% 2)[%s:%dm])) by (%s, %s), "reason", "%s", "", "")`,
  80. clusterFilter, clusterFilter, reason.Bit, durStr, minsPerResolution, gpuSaturationByLabels, clusterLabel, reason.Name,
  81. ))
  82. }
  83. return strings.Join(branches, " or ")
  84. }
  85. // queryGPUSaturation centralizes the shared shape of every GPU saturation
  86. // query method: log it, then issue it at the window end.
  87. func (pds *PrometheusMetricsQuerier) queryGPUSaturation(queryName, query string, end time.Time) *source.Future[source.GPUSaturationResult] {
  88. log.Debugf(PrometheusMetricsQueryLogFormat, queryName, end.Unix(), query)
  89. ctx := pds.promContexts.NewNamedContext(AllocationContextName)
  90. return source.NewFuture(source.DecodeGPUSaturationResult, ctx.QueryAtTime(query, end))
  91. }
  92. // mustDurationString panics like every other querier when the window cannot
  93. // be expressed as a duration string.
  94. func mustDurationString(queryName string, start, end time.Time) string {
  95. durStr := timeutil.DurationString(end.Sub(start))
  96. if durStr == "" {
  97. panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
  98. }
  99. return durStr
  100. }
  101. // QueryGPUThrottleViolationRatio queries the fraction of the window each GPU
  102. // spent throttled, per reason, from the DCGM violation counters (default
  103. // dcgm-exporter configuration).
  104. func (pds *PrometheusMetricsQuerier) QueryGPUThrottleViolationRatio(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  105. const queryName = "QueryGPUThrottleViolationRatio"
  106. cfg := pds.promConfig
  107. durStr := mustDurationString(queryName, start, end)
  108. query := buildGPUThrottleViolationQuery(cfg.ClusterFilter, durStr, cfg.ClusterLabel, end.Sub(start).Seconds())
  109. return pds.queryGPUSaturation(queryName, query, end)
  110. }
  111. // QueryGPUThrottleReasonRatio queries the fraction of the window each
  112. // saturation-relevant bit of the clock throttle reasons bitmask was set.
  113. // Requires DCGM_FI_DEV_CLOCK_THROTTLE_REASONS (or its DCGM 3.3+ rename) to
  114. // be enabled in the dcgm-exporter configuration.
  115. func (pds *PrometheusMetricsQuerier) QueryGPUThrottleReasonRatio(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  116. const queryName = "QueryGPUThrottleReasonRatio"
  117. cfg := pds.promConfig
  118. minsPerResolution := cfg.DataResolutionMinutes
  119. durStr := pds.durationStringFor(start, end, minsPerResolution, false)
  120. if durStr == "" {
  121. panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
  122. }
  123. query := buildGPUThrottleReasonQuery(cfg.ClusterFilter, durStr, cfg.ClusterLabel, minsPerResolution)
  124. return pds.queryGPUSaturation(queryName, query, end)
  125. }
  126. // QueryGPUMemoryUsedRatioAvg queries average framebuffer occupancy over the
  127. // window: FB_USED / (FB_USED + FB_FREE). Default dcgm-exporter configuration.
  128. func (pds *PrometheusMetricsQuerier) QueryGPUMemoryUsedRatioAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  129. const queryName = "QueryGPUMemoryUsedRatioAvg"
  130. const queryFmt = `avg(avg_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) / (avg(avg_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) + avg(avg_over_time(DCGM_FI_DEV_FB_FREE{container!="",%s}[%s])) by (%s, %s))`
  131. cfg := pds.promConfig
  132. durStr := mustDurationString(queryName, start, end)
  133. query := fmt.Sprintf(queryFmt,
  134. cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel,
  135. cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel,
  136. cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel,
  137. )
  138. return pds.queryGPUSaturation(queryName, query, end)
  139. }
  140. // QueryGPUMemoryUsedRatioMax queries peak framebuffer occupancy over the
  141. // window. The denominator uses window averages, which is exact because
  142. // FB_USED + FB_FREE is the fixed framebuffer capacity.
  143. func (pds *PrometheusMetricsQuerier) QueryGPUMemoryUsedRatioMax(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  144. const queryName = "QueryGPUMemoryUsedRatioMax"
  145. const queryFmt = `max(max_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) / (avg(avg_over_time(DCGM_FI_DEV_FB_USED{container!="",%s}[%s])) by (%s, %s) + avg(avg_over_time(DCGM_FI_DEV_FB_FREE{container!="",%s}[%s])) by (%s, %s))`
  146. cfg := pds.promConfig
  147. durStr := mustDurationString(queryName, start, end)
  148. query := fmt.Sprintf(queryFmt,
  149. cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel,
  150. cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel,
  151. cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel,
  152. )
  153. return pds.queryGPUSaturation(queryName, query, end)
  154. }
  155. // QueryGPUMemoryPressureRatio queries the fraction of window samples in
  156. // which framebuffer occupancy exceeded the configured threshold.
  157. func (pds *PrometheusMetricsQuerier) QueryGPUMemoryPressureRatio(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  158. const queryName = "QueryGPUMemoryPressureRatio"
  159. const queryFmt = `avg(avg_over_time(((DCGM_FI_DEV_FB_USED{container!="",%s} / (DCGM_FI_DEV_FB_USED{container!="",%s} + DCGM_FI_DEV_FB_FREE{container!="",%s})) >= bool %g)[%s:%dm])) by (%s, %s)`
  160. cfg := pds.promConfig
  161. minsPerResolution := cfg.DataResolutionMinutes
  162. durStr := pds.durationStringFor(start, end, minsPerResolution, false)
  163. if durStr == "" {
  164. panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
  165. }
  166. query := fmt.Sprintf(queryFmt,
  167. cfg.ClusterFilter, cfg.ClusterFilter, cfg.ClusterFilter,
  168. cfg.GPUMemorySaturationThreshold, durStr, minsPerResolution,
  169. gpuSaturationByLabels, cfg.ClusterLabel,
  170. )
  171. return pds.queryGPUSaturation(queryName, query, end)
  172. }
  173. // QueryGPUXIDErrorCount queries the number of XID error events observed in
  174. // the window. DCGM_FI_DEV_XID_ERRORS reports the most recent XID code, so
  175. // this counts changes of that value; consecutive identical errors are
  176. // undercounted. Default dcgm-exporter configuration.
  177. func (pds *PrometheusMetricsQuerier) QueryGPUXIDErrorCount(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  178. const queryName = "QueryGPUXIDErrorCount"
  179. const queryFmt = `sum(changes(DCGM_FI_DEV_XID_ERRORS{container!="",%s}[%s])) by (%s, %s)`
  180. cfg := pds.promConfig
  181. durStr := mustDurationString(queryName, start, end)
  182. query := fmt.Sprintf(queryFmt, cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel)
  183. return pds.queryGPUSaturation(queryName, query, end)
  184. }
  185. // queryGPUGaugeOverTime is the shared implementation for the DCP profiling
  186. // gauges that need only an avg or max over the window.
  187. func (pds *PrometheusMetricsQuerier) queryGPUGaugeOverTime(queryName, metric, agg string, start, end time.Time) *source.Future[source.GPUSaturationResult] {
  188. const queryFmt = `%s(%s_over_time(%s{container!="",%s}[%s])) by (%s, %s)`
  189. cfg := pds.promConfig
  190. durStr := mustDurationString(queryName, start, end)
  191. query := fmt.Sprintf(queryFmt, agg, agg, metric, cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel)
  192. return pds.queryGPUSaturation(queryName, query, end)
  193. }
  194. // queryGPUCounterRate is the shared implementation for the DCP byte
  195. // counters reported as average bytes/sec over the window.
  196. func (pds *PrometheusMetricsQuerier) queryGPUCounterRate(queryName, metric string, start, end time.Time) *source.Future[source.GPUSaturationResult] {
  197. const queryFmt = `avg(rate(%s{container!="",%s}[%s])) by (%s, %s)`
  198. cfg := pds.promConfig
  199. durStr := mustDurationString(queryName, start, end)
  200. query := fmt.Sprintf(queryFmt, metric, cfg.ClusterFilter, durStr, gpuSaturationByLabels, cfg.ClusterLabel)
  201. return pds.queryGPUSaturation(queryName, query, end)
  202. }
  203. // QueryGPUDRAMActiveAvg queries the average ratio of cycles the device
  204. // memory interface was active. Requires DCP profiling.
  205. func (pds *PrometheusMetricsQuerier) QueryGPUDRAMActiveAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  206. return pds.queryGPUGaugeOverTime("QueryGPUDRAMActiveAvg", "DCGM_FI_PROF_DRAM_ACTIVE", "avg", start, end)
  207. }
  208. // QueryGPUDRAMActiveMax queries the peak ratio of cycles the device memory
  209. // interface was active. Requires DCP profiling.
  210. func (pds *PrometheusMetricsQuerier) QueryGPUDRAMActiveMax(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  211. return pds.queryGPUGaugeOverTime("QueryGPUDRAMActiveMax", "DCGM_FI_PROF_DRAM_ACTIVE", "max", start, end)
  212. }
  213. // QueryGPUSMActiveAvg queries the average ratio of cycles at least one warp
  214. // was assigned to any SM. Requires DCP profiling and explicit enablement in
  215. // the dcgm-exporter configuration.
  216. func (pds *PrometheusMetricsQuerier) QueryGPUSMActiveAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  217. return pds.queryGPUGaugeOverTime("QueryGPUSMActiveAvg", "DCGM_FI_PROF_SM_ACTIVE", "avg", start, end)
  218. }
  219. // QueryGPUSMOccupancyAvg queries the average ratio of resident warps to the
  220. // SM maximum. Requires DCP profiling and explicit enablement in the
  221. // dcgm-exporter configuration.
  222. func (pds *PrometheusMetricsQuerier) QueryGPUSMOccupancyAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  223. return pds.queryGPUGaugeOverTime("QueryGPUSMOccupancyAvg", "DCGM_FI_PROF_SM_OCCUPANCY", "avg", start, end)
  224. }
  225. // QueryGPUPCIeTxBytesAvg queries average PCIe transmit throughput in
  226. // bytes/sec. Requires DCP profiling.
  227. func (pds *PrometheusMetricsQuerier) QueryGPUPCIeTxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  228. return pds.queryGPUCounterRate("QueryGPUPCIeTxBytesAvg", "DCGM_FI_PROF_PCIE_TX_BYTES", start, end)
  229. }
  230. // QueryGPUPCIeRxBytesAvg queries average PCIe receive throughput in
  231. // bytes/sec. Requires DCP profiling.
  232. func (pds *PrometheusMetricsQuerier) QueryGPUPCIeRxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  233. return pds.queryGPUCounterRate("QueryGPUPCIeRxBytesAvg", "DCGM_FI_PROF_PCIE_RX_BYTES", start, end)
  234. }
  235. // QueryGPUNVLinkTxBytesAvg queries average NVLink transmit throughput in
  236. // bytes/sec. Requires DCP profiling and explicit enablement in the
  237. // dcgm-exporter configuration.
  238. func (pds *PrometheusMetricsQuerier) QueryGPUNVLinkTxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  239. return pds.queryGPUCounterRate("QueryGPUNVLinkTxBytesAvg", "DCGM_FI_PROF_NVLINK_TX_BYTES", start, end)
  240. }
  241. // QueryGPUNVLinkRxBytesAvg queries average NVLink receive throughput in
  242. // bytes/sec. Requires DCP profiling and explicit enablement in the
  243. // dcgm-exporter configuration.
  244. func (pds *PrometheusMetricsQuerier) QueryGPUNVLinkRxBytesAvg(start, end time.Time) *source.Future[source.GPUSaturationResult] {
  245. return pds.queryGPUCounterRate("QueryGPUNVLinkRxBytesAvg", "DCGM_FI_PROF_NVLINK_RX_BYTES", start, end)
  246. }