| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197 |
- package kubemodel
- import (
- "fmt"
- "time"
- "maps"
- )
- // DCGMDevice holds recording from the DCGM exporter which provides identification and usage metrics for
- // Nvidia gpu. These Nvidia devices can be incorporated into the cluster via k8s Device Plugin API or DRAs.
- // While the DCGM exporter does provide unique identifiers for the containers that it is reporting metrics on,
- // It is split out here to provide some isolate from the rest of the KubeModel which represent universal structures
- // from the k8s API. It is left to the end user to interpret the relationships to the rest of the cluster based on
- // container unique identifiers
- // @bingen:generate:DCGMDevice
- type DCGMDevice struct {
- UUID string `json:"uuid"`
- Start time.Time `json:"start"`
- End time.Time `json:"end"`
- Device string `json:"device"`
- ModelName string `json:"modelName"`
- PodUsages map[string]DCGMPod `json:"podUsages"`
- // USE-method saturation signals for this device, nil when no
- // saturation metrics were available (see DCGMDeviceSaturation)
- Saturation *DCGMDeviceSaturation `json:"saturation,omitempty"` // @bingen:field[version=3]
- // Device-level metrics from the default dcgm-exporter configuration.
- // Compute utilization is a percentage (0-100); memory used is absolute
- // bytes. Zero means not collected for the window.
- PowerWatts float64 `json:"powerWatts,omitempty"` // @bingen:field[version=4]
- TemperatureCelsius float64 `json:"temperatureCelsius,omitempty"` // @bingen:field[version=4]
- ComputeUtilizationAvg float64 `json:"computeUtilizationAvg,omitempty"` // @bingen:field[version=4]
- ComputeUtilizationMax float64 `json:"computeUtilizationMax,omitempty"` // @bingen:field[version=4]
- MemoryUsedBytesAvg float64 `json:"memoryUsedBytesAvg,omitempty"` // @bingen:field[version=4]
- MemoryUsedBytesMax float64 `json:"memoryUsedBytesMax,omitempty"` // @bingen:field[version=4]
- }
- var (
- _ DeviceInfo = (*DCGMDevice)(nil)
- _ DevicePerformance = (*DCGMDevice)(nil)
- _ DeviceSaturation = (*DCGMDevice)(nil)
- )
- // DeviceInfo implementation. MIG parentage is not derivable from
- // dcgm-exporter labels (a MIG instance's series do not carry the physical
- // GPU UUID), so GetParent reports empty until a source for the mapping
- // exists (e.g. DRA slice attributes).
- func (d *DCGMDevice) GetIdentifier() string { return d.UUID }
- func (d *DCGMDevice) GetType() string { return "GPU" }
- func (d *DCGMDevice) GetName() string { return d.ModelName }
- func (d *DCGMDevice) GetPower() float64 { return d.PowerWatts }
- func (d *DCGMDevice) GetStart() time.Time { return d.Start }
- func (d *DCGMDevice) GetEnd() time.Time { return d.End }
- func (d *DCGMDevice) GetParent() string { return "" }
- // DevicePerformance implementation, backed by device-level DCGM series:
- // GR_ENGINE_ACTIVE (scaled to percent), FB_USED (scaled to bytes), and
- // GPU_TEMP.
- func (d *DCGMDevice) GetComputeUtilizationAverage() float64 { return d.ComputeUtilizationAvg }
- func (d *DCGMDevice) GetComputeUtilizationMax() float64 { return d.ComputeUtilizationMax }
- func (d *DCGMDevice) GetMemoryUtilizationAverage() float64 { return d.MemoryUsedBytesAvg }
- func (d *DCGMDevice) GetMemoryUtilizationMax() float64 { return d.MemoryUsedBytesMax }
- func (d *DCGMDevice) GetTemp() float64 { return d.TemperatureCelsius }
- // DeviceSaturation implementation. The vendor-neutral getters map onto
- // DCGM concepts: throttle violation counters DCGM_FI_DEV_*_VIOLATION,
- // throttle reason bitmask DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, framebuffer
- // occupancy FB_USED/(FB_USED+FB_FREE), error events = XID errors, memory
- // bandwidth = DRAM_ACTIVE, compute active/occupancy = SM_ACTIVE /
- // SM_OCCUPANCY, host link = PCIe, peer link = NVLink. ok=false / nil means
- // the DCGM field was unavailable, never zero.
- // saturationValue adapts the saturation struct's pointer fields to the
- // interface's (value, ok) presence contract, including when Saturation is
- // entirely nil.
- func (d *DCGMDevice) saturationValue(get func(*DCGMDeviceSaturation) *float64) (float64, bool) {
- if d.Saturation == nil {
- return 0, false
- }
- v := get(d.Saturation)
- if v == nil {
- return 0, false
- }
- return *v, true
- }
- func (d *DCGMDevice) GetThrottleViolationRatios() map[string]float64 {
- if d.Saturation == nil {
- return nil
- }
- return maps.Clone(d.Saturation.ThrottleViolationRatios)
- }
- func (d *DCGMDevice) GetThrottleReasonRatios() map[string]float64 {
- if d.Saturation == nil {
- return nil
- }
- return maps.Clone(d.Saturation.ThrottleReasonRatios)
- }
- func (d *DCGMDevice) GetMemoryUsedRatioAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryUsedRatioAvg })
- }
- func (d *DCGMDevice) GetMemoryUsedRatioMax() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryUsedRatioMax })
- }
- func (d *DCGMDevice) GetMemoryPressureRatio() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryPressureRatio })
- }
- func (d *DCGMDevice) GetErrorEventCount() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.XIDErrorCount })
- }
- func (d *DCGMDevice) GetMemoryBandwidthActiveAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.DRAMActiveAvg })
- }
- func (d *DCGMDevice) GetMemoryBandwidthActiveMax() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.DRAMActiveMax })
- }
- func (d *DCGMDevice) GetComputeActiveAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.SMActiveAvg })
- }
- func (d *DCGMDevice) GetComputeOccupancyAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.SMOccupancyAvg })
- }
- func (d *DCGMDevice) GetHostLinkTxBytesAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.PCIeTxBytesAvg })
- }
- func (d *DCGMDevice) GetHostLinkRxBytesAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.PCIeRxBytesAvg })
- }
- func (d *DCGMDevice) GetPeerLinkTxBytesAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.NVLinkTxBytesAvg })
- }
- func (d *DCGMDevice) GetPeerLinkRxBytesAvg() (float64, bool) {
- return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.NVLinkRxBytesAvg })
- }
- // @bingen:generate:DCGMPod
- type DCGMPod struct {
- ContainerUsages map[string]DCGMContainer `json:"container-usages"`
- }
// DCGMContainer holds the average and maximum usage recorded for a single
// container entry of a DCGMPod.
//
// NOTE(review): the unit of the usage values is not stated in this file —
// confirm against the collector before interpreting them.
// @bingen:generate:DCGMContainer
type DCGMContainer struct {
	UsageAvg float64 `json:"usageAvg"`
	UsageMax float64 `json:"usageMax"`
}
- func (d *DCGMDevice) ValidateDCGMDevice(window Window) error {
- if d.UUID == "" {
- return fmt.Errorf("UUID is missing for DCGMDevice with device '%s'", d.Device)
- }
- if err := checkWindow(window, d.Start, d.End); err != nil {
- return err
- }
- if err := d.Saturation.Validate(); err != nil {
- return fmt.Errorf("invalid Saturation for DCGMDevice '%s': %w", d.UUID, err)
- }
- return nil
- }
- // RegisterDCGMDevice validates and adds a DCGMDevice to the set, keyed by UUID.
- func (kms *KubeModelSet) RegisterDCGMDevice(device *DCGMDevice) error {
- if err := device.ValidateDCGMDevice(kms.Window); err != nil {
- err = fmt.Errorf("RegisterDCGMDevice: invalid dcgm device: %w", err)
- kms.Error(err)
- return err
- }
- if _, ok := kms.DCGMDevices[device.UUID]; !ok {
- if kms.Cluster == nil {
- kms.Warnf("RegisterDCGMDevice: Cluster is nil")
- }
- kms.DCGMDevices[device.UUID] = device
- kms.Metadata.ObjectCount++
- }
- return nil
- }
|