device_interface.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. package kubemodel
  2. import "time"
  3. // Accelerator devices are modeled as per-vendor concrete types (e.g.
  4. // DCGMDevice for NVIDIA) behind small interfaces rather than one generic
  5. // concrete struct. A generic type forces every vendor's changes into shared
  6. // code, implies relational fields not all vendors can provide, and blurs
  7. // semantics that differ between vendors (e.g. what "power" means for AMD vs
  8. // NVIDIA). Overly specific shared models have the opposite failure:
  9. // collecting more than is used and looking half-supported for vendors
  10. // missing a feature. Vendor structs keep agent-side transformation minimal
  11. // (data ships close to its source shape, e.g. DCGM for NVIDIA), while these
  12. // interfaces provide the common surface for identification, utilization,
  13. // performance, and saturation.
  14. //
  15. // Capacity is intentionally absent: it is not readily available from vendor
  16. // telemetry (e.g. DCGM) and is inferred via device plugins / DRA, which are
  17. // assumed prerequisites and provide the request half of device allocation.
  18. // Fan speed, PCIe topology detail, and topology are out of scope.
  19. // DeviceInfo identifies a physical or virtualized accelerator device and
  20. // its lifecycle within the observed window.
  21. type DeviceInfo interface {
  22. // GetIdentifier returns the device UUID. For virtualized devices
  23. // (e.g. a MIG instance) this is the UUID of the virtual device.
  24. GetIdentifier() string
  25. // GetType returns the human-readable device type, e.g. "GPU".
  26. GetType() string
  27. // GetName returns the marketing/model name, e.g. "NVIDIA A100 80GB".
  28. GetName() string
  29. // GetPower returns the device power draw in watts.
  30. GetPower() float64
  31. // GetStart returns the time the device came online in the observed
  32. // window.
  33. GetStart() time.Time
  34. // GetEnd returns the time the device's observation ended.
  35. GetEnd() time.Time
  36. // GetParent returns the UUID of the parent device, or empty if none.
  37. // For a MIG instance this is the UUID of the physical GPU.
  38. GetParent() string
  39. }
  40. // DevicePerformance exposes utilization and performance measures for an
  41. // accelerator device.
  42. type DevicePerformance interface {
  43. // GetComputeUtilizationAverage returns the average percentage of
  44. // compute used over the observed window.
  45. GetComputeUtilizationAverage() float64
  46. // GetComputeUtilizationMax returns the max percentage of compute used.
  47. GetComputeUtilizationMax() float64
  48. // GetMemoryUtilizationAverage returns the average absolute memory used
  49. // in bytes.
  50. GetMemoryUtilizationAverage() float64
  51. // GetMemoryUtilizationMax returns the max absolute memory used in
  52. // bytes.
  53. GetMemoryUtilizationMax() float64
  54. // GetTemp returns the current device temperature in degrees Celsius.
  55. GetTemp() float64
  56. }
  57. // DeviceSaturation exposes USE-method saturation measures for an
  58. // accelerator device: where DevicePerformance reports how busy the device
  59. // was, saturation reports work queued, rejected, or slowed because the
  60. // device could not service demand.
  61. //
  62. // Absence is meaningful and distinct from zero: scalar getters return
  63. // ok=false and map getters return nil when the vendor telemetry for that
  64. // signal was unavailable, so consumers never mistake "not measured" for
  65. // "not saturated". Names are vendor-neutral; the mapping to vendor
  66. // telemetry (e.g. DCGM fields for NVIDIA) lives in each implementation.
  67. // Ratios are fractions of the observed window in [0, 1].
  68. type DeviceSaturation interface {
  69. // GetThrottleViolationRatios returns the fraction of the window the
  70. // device spent throttled, keyed by vendor-defined reason name, from
  71. // cumulative time-in-violation counters. Nil when unavailable.
  72. GetThrottleViolationRatios() map[string]float64
  73. // GetThrottleReasonRatios returns the fraction of the window each
  74. // vendor-defined throttle reason state was active, from sampled state
  75. // flags. Nil when unavailable.
  76. GetThrottleReasonRatios() map[string]float64
  77. // GetMemoryUsedRatioAvg returns average device-memory occupancy.
  78. GetMemoryUsedRatioAvg() (float64, bool)
  79. // GetMemoryUsedRatioMax returns peak device-memory occupancy.
  80. GetMemoryUsedRatioMax() (float64, bool)
  81. // GetMemoryPressureRatio returns the fraction of the window
  82. // device-memory occupancy exceeded the configured threshold.
  83. GetMemoryPressureRatio() (float64, bool)
  84. // GetErrorEventCount returns the number of device error events
  85. // (rejected work) observed in the window.
  86. GetErrorEventCount() (float64, bool)
  87. // GetMemoryBandwidthActiveAvg returns the average ratio of cycles the
  88. // device memory interface was active.
  89. GetMemoryBandwidthActiveAvg() (float64, bool)
  90. // GetMemoryBandwidthActiveMax returns the peak ratio of cycles the
  91. // device memory interface was active.
  92. GetMemoryBandwidthActiveMax() (float64, bool)
  93. // GetComputeActiveAvg returns the average ratio of cycles compute
  94. // units had work resident. Together with GetComputeOccupancyAvg and
  95. // GetMemoryBandwidthActiveAvg, consumers can distinguish
  96. // compute-bound from bandwidth- or latency-bound saturation.
  97. GetComputeActiveAvg() (float64, bool)
  98. // GetComputeOccupancyAvg returns the average ratio of resident work
  99. // to the compute units' maximum.
  100. GetComputeOccupancyAvg() (float64, bool)
  101. // GetHostLinkTxBytesAvg/RxBytesAvg return average host-interconnect
  102. // throughput (e.g. PCIe) in bytes/sec. Link capacity is not derivable
  103. // from vendor telemetry, so these are rates, not ratios.
  104. GetHostLinkTxBytesAvg() (float64, bool)
  105. GetHostLinkRxBytesAvg() (float64, bool)
  106. // GetPeerLinkTxBytesAvg/RxBytesAvg return average device-to-device
  107. // fabric throughput (e.g. NVLink) in bytes/sec.
  108. GetPeerLinkTxBytesAvg() (float64, bool)
  109. GetPeerLinkRxBytesAvg() (float64, bool)
  110. }