2
0

gpudevicequerier.go 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. package prom
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/opencost/opencost/core/pkg/source"
  6. )
// Device-level GPU metric queries supporting the DeviceInfo and
// DevicePerformance contracts on DCGMDevice: power draw, temperature,
// device-level compute utilization, and absolute memory used. Unlike the
// container-attributed saturation queries, these group by device identity
// only, since the values describe the whole device (or MIG instance)
// regardless of which containers share it. All source fields are in the
// default dcgm-exporter configuration.
  14. // gpuDeviceByLabels groups series by device identity (and MIG instance)
  15. // without container attribution.
  16. const gpuDeviceByLabels = `device, modelName, UUID, GPU_I_PROFILE, GPU_I_ID`
  17. // queryGPUDeviceGauge issues an agg(agg_over_time(...)) query for a
  18. // device-level DCGM gauge.
  19. func (pds *PrometheusMetricsQuerier) queryGPUDeviceGauge(queryName, metric, agg string, start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  20. const queryFmt = `%s(%s_over_time(%s{%s}[%s])) by (%s, %s)`
  21. cfg := pds.promConfig
  22. durStr := mustDurationString(queryName, start, end)
  23. query := fmt.Sprintf(queryFmt, agg, agg, metric, cfg.ClusterFilter, durStr, gpuDeviceByLabels, cfg.ClusterLabel)
  24. return pds.queryGPUSaturation(queryName, query, end)
  25. }
  26. // QueryGPUDevicePowerAvg queries average device power draw in watts
  27. // (DCGM_FI_DEV_POWER_USAGE).
  28. func (pds *PrometheusMetricsQuerier) QueryGPUDevicePowerAvg(start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  29. return pds.queryGPUDeviceGauge("QueryGPUDevicePowerAvg", "DCGM_FI_DEV_POWER_USAGE", "avg", start, end)
  30. }
  31. // QueryGPUDeviceTempAvg queries average device temperature in degrees
  32. // Celsius (DCGM_FI_DEV_GPU_TEMP).
  33. func (pds *PrometheusMetricsQuerier) QueryGPUDeviceTempAvg(start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  34. return pds.queryGPUDeviceGauge("QueryGPUDeviceTempAvg", "DCGM_FI_DEV_GPU_TEMP", "avg", start, end)
  35. }
  36. // QueryGPUDeviceUsageAvg queries average device-level compute utilization
  37. // as a 0-1 ratio (DCGM_FI_PROF_GR_ENGINE_ACTIVE without container
  38. // attribution).
  39. func (pds *PrometheusMetricsQuerier) QueryGPUDeviceUsageAvg(start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  40. return pds.queryGPUDeviceGauge("QueryGPUDeviceUsageAvg", "DCGM_FI_PROF_GR_ENGINE_ACTIVE", "avg", start, end)
  41. }
  42. // QueryGPUDeviceUsageMax queries peak device-level compute utilization as a
  43. // 0-1 ratio.
  44. func (pds *PrometheusMetricsQuerier) QueryGPUDeviceUsageMax(start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  45. return pds.queryGPUDeviceGauge("QueryGPUDeviceUsageMax", "DCGM_FI_PROF_GR_ENGINE_ACTIVE", "max", start, end)
  46. }
  47. // QueryGPUDeviceMemoryUsedAvg queries average framebuffer used in MiB
  48. // (DCGM_FI_DEV_FB_USED); hydration converts to bytes.
  49. func (pds *PrometheusMetricsQuerier) QueryGPUDeviceMemoryUsedAvg(start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  50. return pds.queryGPUDeviceGauge("QueryGPUDeviceMemoryUsedAvg", "DCGM_FI_DEV_FB_USED", "avg", start, end)
  51. }
  52. // QueryGPUDeviceMemoryUsedMax queries peak framebuffer used in MiB.
  53. func (pds *PrometheusMetricsQuerier) QueryGPUDeviceMemoryUsedMax(start, end time.Time) *source.Future[source.GPUDeviceMetricResult] {
  54. return pds.queryGPUDeviceGauge("QueryGPUDeviceMemoryUsedMax", "DCGM_FI_DEV_FB_USED", "max", start, end)
  55. }