Pārlūkot izejas kodu

Hydrate DCGM device saturation in the kubemodel pipeline

Populate per-device GPU saturation and device telemetry on the kubemodel's
DCGMDevice, so the kubecost kubemodel emitter exposes USE-method GPU
signals end to end through the native collectors. This is the consumer of
the saturation/device query layer added previously.

- core/kubemodel: DCGMDevice gains a version-gated Saturation field
  (DCGMDeviceSaturation, codec field version 3) plus device telemetry
  fields - power, temperature, compute utilization avg/max, and device
  memory used avg/max (codec field version 4). The vendor-neutral
  DeviceInfo / DevicePerformance / DeviceSaturation interfaces give a
  per-vendor-agnostic surface over the NVIDIA-specific DCGMDevice. Older
  payloads decode with the new fields zero/nil; binary roundtrip is tested.
- hydration: computeDCGMDevices issues the saturation and device-metric
  query bundle and reduces the container-attributed series (dcgm-exporter
  duplicates each device value onto every pod sharing the device) to one
  value per device UUID, preserving the container-to-slice association in
  PodUsages. MIG instances hydrate as distinct UUID devices.
- GPU_SATURATION_METRICS_ENABLED gates the saturation queries (default on);
  clusters without dcgm-exporter receive empty results and unchanged
  devices.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Cliff Colvin <clifford.colvin@gmail.com>
Cliff Colvin 1 nedēļu atpakaļ
vecāks
revīzija
8b4a3933aa

+ 1 - 1
core/pkg/model/kubemodel/bingen.go

@@ -22,4 +22,4 @@ package kubemodel
 
 
 // @bingen:define[string]:github.com/opencost/opencost/core/pkg/model/shared.Provider
 // @bingen:define[string]:github.com/opencost/opencost/core/pkg/model/shared.Provider
 
 
-//go:generate bingen -package=kubemodel -version=2 -buffer=github.com/opencost/opencost/core/pkg/util
+//go:generate bingen -package=kubemodel -version=4 -buffer=github.com/opencost/opencost/core/pkg/util

+ 131 - 0
core/pkg/model/kubemodel/dcgm.go

@@ -3,6 +3,8 @@ package kubemodel
 import (
 import (
 	"fmt"
 	"fmt"
 	"time"
 	"time"
+
+	"maps"
 )
 )
 
 
 // DCGMDevice holds recording from the DCGM exporter which provides identification and usage metrics for
 // DCGMDevice holds recording from the DCGM exporter which provides identification and usage metrics for
@@ -19,6 +21,131 @@ type DCGMDevice struct {
 	Device    string             `json:"device"`
 	Device    string             `json:"device"`
 	ModelName string             `json:"modelName"`
 	ModelName string             `json:"modelName"`
 	PodUsages map[string]DCGMPod `json:"podUsages"`
 	PodUsages map[string]DCGMPod `json:"podUsages"`
+
+	// USE-method saturation signals for this device, nil when no
+	// saturation metrics were available (see DCGMDeviceSaturation)
+	Saturation *DCGMDeviceSaturation `json:"saturation,omitempty"` // @bingen:field[version=3]
+
+	// Device-level metrics from the default dcgm-exporter configuration.
+	// Compute utilization is a percentage (0-100); memory used is absolute
+	// bytes. Zero means not collected for the window.
+	PowerWatts            float64 `json:"powerWatts,omitempty"`            // @bingen:field[version=4]
+	TemperatureCelsius    float64 `json:"temperatureCelsius,omitempty"`    // @bingen:field[version=4]
+	ComputeUtilizationAvg float64 `json:"computeUtilizationAvg,omitempty"` // @bingen:field[version=4]
+	ComputeUtilizationMax float64 `json:"computeUtilizationMax,omitempty"` // @bingen:field[version=4]
+	MemoryUsedBytesAvg    float64 `json:"memoryUsedBytesAvg,omitempty"`    // @bingen:field[version=4]
+	MemoryUsedBytesMax    float64 `json:"memoryUsedBytesMax,omitempty"`    // @bingen:field[version=4]
+}
+
+var (
+	_ DeviceInfo        = (*DCGMDevice)(nil)
+	_ DevicePerformance = (*DCGMDevice)(nil)
+	_ DeviceSaturation  = (*DCGMDevice)(nil)
+)
+
+// DeviceInfo implementation. MIG parentage is not derivable from
+// dcgm-exporter labels (a MIG instance's series do not carry the physical
+// GPU UUID), so GetParent reports empty until a source for the mapping
+// exists (e.g. DRA slice attributes).
+func (d *DCGMDevice) GetIdentifier() string { return d.UUID }
+func (d *DCGMDevice) GetType() string       { return "GPU" }
+func (d *DCGMDevice) GetName() string       { return d.ModelName }
+func (d *DCGMDevice) GetPower() float64     { return d.PowerWatts }
+func (d *DCGMDevice) GetStart() time.Time   { return d.Start }
+func (d *DCGMDevice) GetEnd() time.Time     { return d.End }
+func (d *DCGMDevice) GetParent() string     { return "" }
+
+// DevicePerformance implementation, backed by device-level DCGM series:
+// GR_ENGINE_ACTIVE (scaled to percent), FB_USED (scaled to bytes), and
+// GPU_TEMP.
+func (d *DCGMDevice) GetComputeUtilizationAverage() float64 { return d.ComputeUtilizationAvg }
+func (d *DCGMDevice) GetComputeUtilizationMax() float64     { return d.ComputeUtilizationMax }
+func (d *DCGMDevice) GetMemoryUtilizationAverage() float64  { return d.MemoryUsedBytesAvg }
+func (d *DCGMDevice) GetMemoryUtilizationMax() float64      { return d.MemoryUsedBytesMax }
+func (d *DCGMDevice) GetTemp() float64                      { return d.TemperatureCelsius }
+
+// DeviceSaturation implementation. The vendor-neutral getters map onto
+// DCGM concepts: throttle violation counters DCGM_FI_DEV_*_VIOLATION,
+// throttle reason bitmask DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, framebuffer
+// occupancy FB_USED/(FB_USED+FB_FREE), error events = XID errors, memory
+// bandwidth = DRAM_ACTIVE, compute active/occupancy = SM_ACTIVE /
+// SM_OCCUPANCY, host link = PCIe, peer link = NVLink. ok=false / nil means
+// the DCGM field was unavailable, never zero.
+
+// saturationValue adapts the saturation struct's pointer fields to the
+// interface's (value, ok) presence contract, including when Saturation is
+// entirely nil.
+func (d *DCGMDevice) saturationValue(get func(*DCGMDeviceSaturation) *float64) (float64, bool) {
+	if d.Saturation == nil {
+		return 0, false
+	}
+	v := get(d.Saturation)
+	if v == nil {
+		return 0, false
+	}
+	return *v, true
+}
+
+func (d *DCGMDevice) GetThrottleViolationRatios() map[string]float64 {
+	if d.Saturation == nil {
+		return nil
+	}
+	return maps.Clone(d.Saturation.ThrottleViolationRatios)
+}
+
+func (d *DCGMDevice) GetThrottleReasonRatios() map[string]float64 {
+	if d.Saturation == nil {
+		return nil
+	}
+	return maps.Clone(d.Saturation.ThrottleReasonRatios)
+}
+
+func (d *DCGMDevice) GetMemoryUsedRatioAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryUsedRatioAvg })
+}
+
+func (d *DCGMDevice) GetMemoryUsedRatioMax() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryUsedRatioMax })
+}
+
+func (d *DCGMDevice) GetMemoryPressureRatio() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.MemoryPressureRatio })
+}
+
+func (d *DCGMDevice) GetErrorEventCount() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.XIDErrorCount })
+}
+
+func (d *DCGMDevice) GetMemoryBandwidthActiveAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.DRAMActiveAvg })
+}
+
+func (d *DCGMDevice) GetMemoryBandwidthActiveMax() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.DRAMActiveMax })
+}
+
+func (d *DCGMDevice) GetComputeActiveAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.SMActiveAvg })
+}
+
+func (d *DCGMDevice) GetComputeOccupancyAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.SMOccupancyAvg })
+}
+
+func (d *DCGMDevice) GetHostLinkTxBytesAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.PCIeTxBytesAvg })
+}
+
+func (d *DCGMDevice) GetHostLinkRxBytesAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.PCIeRxBytesAvg })
+}
+
+func (d *DCGMDevice) GetPeerLinkTxBytesAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.NVLinkTxBytesAvg })
+}
+
+func (d *DCGMDevice) GetPeerLinkRxBytesAvg() (float64, bool) {
+	return d.saturationValue(func(s *DCGMDeviceSaturation) *float64 { return s.NVLinkRxBytesAvg })
 }
 }
 
 
 // @bingen:generate:DCGMPod
 // @bingen:generate:DCGMPod
@@ -41,6 +168,10 @@ func (d *DCGMDevice) ValidateDCGMDevice(window Window) error {
 		return err
 		return err
 	}
 	}
 
 
+	if err := d.Saturation.Validate(); err != nil {
+		return fmt.Errorf("invalid Saturation for DCGMDevice '%s': %w", d.UUID, err)
+	}
+
 	return nil
 	return nil
 }
 }
 
 

+ 134 - 0
core/pkg/model/kubemodel/dcgm_saturation.go

@@ -0,0 +1,134 @@
+package kubemodel
+
+import (
+	"fmt"
+	"math"
+
+	"maps"
+)
+
+// DCGMDeviceSaturation carries USE-method saturation signals for an NVIDIA
+// GPU recorded from the DCGM exporter: where utilization reports how busy
+// the device was, saturation reports work queued, rejected, or slowed
+// because the device could not service demand. It lives on DCGMDevice
+// because these signals are NVIDIA/DCGM-specific; the vendor-neutral
+// surface is the DeviceSaturation interface.
+//
+// Every field is an independent primitive; no composite score is computed.
+// A nil field means the underlying source metric was not available in the
+// window (no dcgm-exporter, field disabled in its config, or no DCP
+// profiling support), never that the value was zero. Ratios are fractions
+// of the observed window in [0, 1] unless noted.
+// @bingen:generate:DCGMDeviceSaturation
+type DCGMDeviceSaturation struct {
+	// ThrottleViolationRatios maps a throttle reason (power, thermal,
+	// sync_boost, board_limit) to the fraction of the window the device
+	// spent throttled for that reason, from the cumulative
+	// DCGM_FI_DEV_*_VIOLATION microsecond counters.
+	ThrottleViolationRatios map[string]float64 `json:"throttleViolationRatios,omitempty"`
+	// ThrottleReasonRatios maps a throttle reason (sw_power_cap,
+	// hw_slowdown, sync_boost, sw_thermal, hw_thermal, hw_power_brake) to
+	// the fraction of the window the corresponding bit of
+	// DCGM_FI_DEV_CLOCK_THROTTLE_REASONS was set.
+	ThrottleReasonRatios map[string]float64 `json:"throttleReasonRatios,omitempty"`
+	// MemoryUsedRatioAvg/Max are framebuffer occupancy over the window:
+	// FB_USED / (FB_USED + FB_FREE).
+	MemoryUsedRatioAvg *float64 `json:"memoryUsedRatioAvg,omitempty"`
+	MemoryUsedRatioMax *float64 `json:"memoryUsedRatioMax,omitempty"`
+	// MemoryPressureRatio is the fraction of the window framebuffer
+	// occupancy exceeded the configured threshold.
+	MemoryPressureRatio *float64 `json:"memoryPressureRatio,omitempty"`
+	// XIDErrorCount counts XID error events observed in the window.
+	XIDErrorCount *float64 `json:"xidErrorCount,omitempty"`
+	// DRAMActiveAvg/Max are the ratio of cycles the device memory interface
+	// was active (DCP profiling).
+	DRAMActiveAvg *float64 `json:"dramActiveAvg,omitempty"`
+	DRAMActiveMax *float64 `json:"dramActiveMax,omitempty"`
+	// SMActiveAvg and SMOccupancyAvg distinguish compute-bound from
+	// bandwidth- or latency-bound saturation (DCP profiling).
+	SMActiveAvg    *float64 `json:"smActiveAvg,omitempty"`
+	SMOccupancyAvg *float64 `json:"smOccupancyAvg,omitempty"`
+	// Interconnect throughput in bytes/sec (DCP profiling). Link capacity
+	// is not derivable from DCGM, so these are raw rates, not ratios.
+	PCIeTxBytesAvg   *float64 `json:"pcieTxBytesAvg,omitempty"`
+	PCIeRxBytesAvg   *float64 `json:"pcieRxBytesAvg,omitempty"`
+	NVLinkTxBytesAvg *float64 `json:"nvlinkTxBytesAvg,omitempty"`
+	NVLinkRxBytesAvg *float64 `json:"nvlinkRxBytesAvg,omitempty"`
+}
+
+// ratioFields returns name/value pairs for every field constrained to
+// [0, 1], for validation.
+func (s *DCGMDeviceSaturation) ratioFields() map[string]*float64 {
+	return map[string]*float64{
+		"MemoryUsedRatioAvg":  s.MemoryUsedRatioAvg,
+		"MemoryUsedRatioMax":  s.MemoryUsedRatioMax,
+		"MemoryPressureRatio": s.MemoryPressureRatio,
+		"DRAMActiveAvg":       s.DRAMActiveAvg,
+		"DRAMActiveMax":       s.DRAMActiveMax,
+		"SMActiveAvg":         s.SMActiveAvg,
+		"SMOccupancyAvg":      s.SMOccupancyAvg,
+	}
+}
+
+// Validate validates the DCGMDeviceSaturation fields.
+func (s *DCGMDeviceSaturation) Validate() error {
+	if s == nil {
+		return nil
+	}
+	for _, m := range []map[string]float64{s.ThrottleViolationRatios, s.ThrottleReasonRatios} {
+		for reason, ratio := range m {
+			if math.IsNaN(ratio) || ratio < 0 || ratio > 1 {
+				return fmt.Errorf("throttle ratio for %q must be 0-1, got %v", reason, ratio)
+			}
+		}
+	}
+	for name, value := range s.ratioFields() {
+		if value != nil && (math.IsNaN(*value) || *value < 0 || *value > 1) {
+			return fmt.Errorf("%s must be 0-1, got %v", name, *value)
+		}
+	}
+	for name, value := range map[string]*float64{
+		"XIDErrorCount":    s.XIDErrorCount,
+		"PCIeTxBytesAvg":   s.PCIeTxBytesAvg,
+		"PCIeRxBytesAvg":   s.PCIeRxBytesAvg,
+		"NVLinkTxBytesAvg": s.NVLinkTxBytesAvg,
+		"NVLinkRxBytesAvg": s.NVLinkRxBytesAvg,
+	} {
+		if value != nil && (math.IsNaN(*value) || *value < 0) {
+			return fmt.Errorf("%s cannot be negative, got %v", name, *value)
+		}
+	}
+	return nil
+}
+
+// Clone creates a deep copy of the DCGMDeviceSaturation.
+func (s *DCGMDeviceSaturation) Clone() *DCGMDeviceSaturation {
+	if s == nil {
+		return nil
+	}
+
+	cloneFloat := func(v *float64) *float64 {
+		if v == nil {
+			return nil
+		}
+		c := *v
+		return &c
+	}
+
+	return &DCGMDeviceSaturation{
+		ThrottleViolationRatios: maps.Clone(s.ThrottleViolationRatios),
+		ThrottleReasonRatios:    maps.Clone(s.ThrottleReasonRatios),
+		MemoryUsedRatioAvg:      cloneFloat(s.MemoryUsedRatioAvg),
+		MemoryUsedRatioMax:      cloneFloat(s.MemoryUsedRatioMax),
+		MemoryPressureRatio:     cloneFloat(s.MemoryPressureRatio),
+		XIDErrorCount:           cloneFloat(s.XIDErrorCount),
+		DRAMActiveAvg:           cloneFloat(s.DRAMActiveAvg),
+		DRAMActiveMax:           cloneFloat(s.DRAMActiveMax),
+		SMActiveAvg:             cloneFloat(s.SMActiveAvg),
+		SMOccupancyAvg:          cloneFloat(s.SMOccupancyAvg),
+		PCIeTxBytesAvg:          cloneFloat(s.PCIeTxBytesAvg),
+		PCIeRxBytesAvg:          cloneFloat(s.PCIeRxBytesAvg),
+		NVLinkTxBytesAvg:        cloneFloat(s.NVLinkTxBytesAvg),
+		NVLinkRxBytesAvg:        cloneFloat(s.NVLinkRxBytesAvg),
+	}
+}

+ 281 - 0
core/pkg/model/kubemodel/dcgm_saturation_test.go

@@ -0,0 +1,281 @@
+package kubemodel
+
+import (
+	"math"
+	"reflect"
+	"testing"
+	"time"
+)
+
+func saturationFloat(v float64) *float64 {
+	return &v
+}
+
+func mockDCGMDeviceSaturation() *DCGMDeviceSaturation {
+	return &DCGMDeviceSaturation{
+		ThrottleViolationRatios: map[string]float64{"power": 0.25},
+		ThrottleReasonRatios:    map[string]float64{"sw_power_cap": 0.2},
+		MemoryUsedRatioAvg:      saturationFloat(0.85),
+		MemoryUsedRatioMax:      saturationFloat(0.99),
+		MemoryPressureRatio:     saturationFloat(0.4),
+		XIDErrorCount:           saturationFloat(2),
+		DRAMActiveAvg:           saturationFloat(0.7),
+		DRAMActiveMax:           saturationFloat(0.95),
+		SMActiveAvg:             saturationFloat(0.6),
+		SMOccupancyAvg:          saturationFloat(0.5),
+		PCIeTxBytesAvg:          saturationFloat(1.5e9),
+		PCIeRxBytesAvg:          saturationFloat(2.5e9),
+		NVLinkTxBytesAvg:        saturationFloat(3.5e9),
+		NVLinkRxBytesAvg:        saturationFloat(4.5e9),
+	}
+}
+
+func mockSaturatedDCGMDevice() *DCGMDevice {
+	return &DCGMDevice{
+		UUID:                  "GPU-1",
+		Start:                 time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC),
+		End:                   time.Date(2026, 6, 1, 1, 0, 0, 0, time.UTC),
+		Device:                "nvidia0",
+		ModelName:             "NVIDIA A100 80GB",
+		Saturation:            mockDCGMDeviceSaturation(),
+		PowerWatts:            140,
+		TemperatureCelsius:    55,
+		ComputeUtilizationAvg: 42.5,
+		ComputeUtilizationMax: 97,
+		MemoryUsedBytesAvg:    32e9,
+		MemoryUsedBytesMax:    71e9,
+	}
+}
+
+// TestDCGMDevice_DeviceInfoInterface verifies the vendor-neutral identity
+// surface on the DCGM device.
+func TestDCGMDevice_DeviceInfoInterface(t *testing.T) {
+	var info DeviceInfo = mockSaturatedDCGMDevice()
+
+	if info.GetIdentifier() != "GPU-1" {
+		t.Errorf("GetIdentifier() = %q, want GPU-1", info.GetIdentifier())
+	}
+	if info.GetType() != "GPU" {
+		t.Errorf("GetType() = %q, want GPU", info.GetType())
+	}
+	if info.GetName() != "NVIDIA A100 80GB" {
+		t.Errorf("GetName() = %q", info.GetName())
+	}
+	if !info.GetStart().Before(info.GetEnd()) {
+		t.Errorf("GetStart() not before GetEnd()")
+	}
+	if info.GetPower() != 140 {
+		t.Errorf("GetPower() = %v, want 140", info.GetPower())
+	}
+	// MIG parentage is not derivable from dcgm-exporter labels yet
+	if info.GetParent() != "" {
+		t.Errorf("expected empty parent until a mapping source exists, got %q", info.GetParent())
+	}
+}
+
+// TestDCGMDevice_DevicePerformanceInterface verifies the performance surface
+// is backed by the device-level metric fields with documented units.
+func TestDCGMDevice_DevicePerformanceInterface(t *testing.T) {
+	var perf DevicePerformance = mockSaturatedDCGMDevice()
+
+	if perf.GetComputeUtilizationAverage() != 42.5 || perf.GetComputeUtilizationMax() != 97 {
+		t.Errorf("compute utilization = (%v, %v), want (42.5, 97)", perf.GetComputeUtilizationAverage(), perf.GetComputeUtilizationMax())
+	}
+	if perf.GetMemoryUtilizationAverage() != 32e9 || perf.GetMemoryUtilizationMax() != 71e9 {
+		t.Errorf("memory utilization = (%v, %v), want (3.2e10, 7.1e10)", perf.GetMemoryUtilizationAverage(), perf.GetMemoryUtilizationMax())
+	}
+	if perf.GetTemp() != 55 {
+		t.Errorf("GetTemp() = %v, want 55", perf.GetTemp())
+	}
+}
+
+// TestDCGMDevice_DeviceSaturationInterface verifies the saturation getters
+// expose every signal with correct values, that absence (nil Saturation or
+// nil field) reports ok=false / nil rather than zero, and that returned
+// maps are copies.
+func TestDCGMDevice_DeviceSaturationInterface(t *testing.T) {
+	var sat DeviceSaturation = mockSaturatedDCGMDevice()
+
+	scalarChecks := map[string]struct {
+		get  func() (float64, bool)
+		want float64
+	}{
+		"MemoryUsedRatioAvg":       {sat.GetMemoryUsedRatioAvg, 0.85},
+		"MemoryUsedRatioMax":       {sat.GetMemoryUsedRatioMax, 0.99},
+		"MemoryPressureRatio":      {sat.GetMemoryPressureRatio, 0.4},
+		"ErrorEventCount":          {sat.GetErrorEventCount, 2},
+		"MemoryBandwidthActiveAvg": {sat.GetMemoryBandwidthActiveAvg, 0.7},
+		"MemoryBandwidthActiveMax": {sat.GetMemoryBandwidthActiveMax, 0.95},
+		"ComputeActiveAvg":         {sat.GetComputeActiveAvg, 0.6},
+		"ComputeOccupancyAvg":      {sat.GetComputeOccupancyAvg, 0.5},
+		"HostLinkTxBytesAvg":       {sat.GetHostLinkTxBytesAvg, 1.5e9},
+		"HostLinkRxBytesAvg":       {sat.GetHostLinkRxBytesAvg, 2.5e9},
+		"PeerLinkTxBytesAvg":       {sat.GetPeerLinkTxBytesAvg, 3.5e9},
+		"PeerLinkRxBytesAvg":       {sat.GetPeerLinkRxBytesAvg, 4.5e9},
+	}
+	for name, check := range scalarChecks {
+		got, ok := check.get()
+		if !ok || got != check.want {
+			t.Errorf("%s = (%v, %v), want (%v, true)", name, got, ok, check.want)
+		}
+	}
+
+	if got := sat.GetThrottleViolationRatios(); got["power"] != 0.25 {
+		t.Errorf("GetThrottleViolationRatios() = %v, want power: 0.25", got)
+	}
+	if got := sat.GetThrottleReasonRatios(); got["sw_power_cap"] != 0.2 {
+		t.Errorf("GetThrottleReasonRatios() = %v, want sw_power_cap: 0.2", got)
+	}
+
+	// returned maps are copies: consumers must not mutate device state
+	sat.GetThrottleViolationRatios()["power"] = 0.99
+	if got := sat.GetThrottleViolationRatios(); got["power"] != 0.25 {
+		t.Errorf("interface exposed internal throttle map: %v", got)
+	}
+
+	// absence semantics: nil Saturation reports ok=false / nil, never zero
+	var absent DeviceSaturation = &DCGMDevice{UUID: "GPU-2"}
+	if _, ok := absent.GetMemoryUsedRatioAvg(); ok {
+		t.Errorf("expected ok=false for device without saturation")
+	}
+	if got := absent.GetThrottleViolationRatios(); got != nil {
+		t.Errorf("expected nil throttle map for device without saturation, got %v", got)
+	}
+
+	// per-field absence: present Saturation with one nil field
+	partial := mockSaturatedDCGMDevice()
+	partial.Saturation.SMActiveAvg = nil
+	if _, ok := partial.GetComputeActiveAvg(); ok {
+		t.Errorf("expected ok=false for absent SM active signal")
+	}
+	if got, ok := partial.GetComputeOccupancyAvg(); !ok || got != 0.5 {
+		t.Errorf("sibling signal lost: (%v, %v), want (0.5, true)", got, ok)
+	}
+}
+
+func TestDCGMDeviceSaturation_Validate(t *testing.T) {
+	cases := map[string]struct {
+		mutate  func(*DCGMDeviceSaturation)
+		wantErr bool
+	}{
+		"valid":  {mutate: func(s *DCGMDeviceSaturation) {}, wantErr: false},
+		"nil ok": {mutate: nil, wantErr: false},
+		"empty":  {mutate: func(s *DCGMDeviceSaturation) { *s = DCGMDeviceSaturation{} }, wantErr: false},
+		"ratio above one": {
+			mutate:  func(s *DCGMDeviceSaturation) { s.MemoryUsedRatioAvg = saturationFloat(1.1) },
+			wantErr: true,
+		},
+		"negative ratio": {
+			mutate:  func(s *DCGMDeviceSaturation) { s.SMActiveAvg = saturationFloat(-0.1) },
+			wantErr: true,
+		},
+		"NaN ratio": {
+			mutate:  func(s *DCGMDeviceSaturation) { s.DRAMActiveAvg = saturationFloat(math.NaN()) },
+			wantErr: true,
+		},
+		"throttle map ratio above one": {
+			mutate:  func(s *DCGMDeviceSaturation) { s.ThrottleReasonRatios["sw_power_cap"] = 1.5 },
+			wantErr: true,
+		},
+		"negative byte rate": {
+			mutate:  func(s *DCGMDeviceSaturation) { s.PCIeTxBytesAvg = saturationFloat(-1) },
+			wantErr: true,
+		},
+		"negative xid count": {
+			mutate:  func(s *DCGMDeviceSaturation) { s.XIDErrorCount = saturationFloat(-1) },
+			wantErr: true,
+		},
+	}
+
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			var sat *DCGMDeviceSaturation
+			if tc.mutate != nil {
+				sat = mockDCGMDeviceSaturation()
+				tc.mutate(sat)
+			}
+			err := sat.Validate()
+			if (err != nil) != tc.wantErr {
+				t.Errorf("Validate() error = %v, wantErr %v", err, tc.wantErr)
+			}
+		})
+	}
+
+	// invalid saturation must fail device validation too
+	device := mockSaturatedDCGMDevice()
+	device.Saturation.MemoryUsedRatioAvg = saturationFloat(2.0)
+	window := Window{Start: device.Start, End: device.End}
+	if err := device.ValidateDCGMDevice(window); err == nil {
+		t.Errorf("expected ValidateDCGMDevice to reject invalid saturation")
+	}
+}
+
+func TestDCGMDeviceSaturation_Clone(t *testing.T) {
+	var nilSat *DCGMDeviceSaturation
+	if nilSat.Clone() != nil {
+		t.Fatalf("expected nil Clone of nil DCGMDeviceSaturation")
+	}
+
+	orig := mockDCGMDeviceSaturation()
+	clone := orig.Clone()
+
+	if !reflect.DeepEqual(orig, clone) {
+		t.Fatalf("clone differs from original:\n got %+v\nwant %+v", clone, orig)
+	}
+
+	clone.ThrottleViolationRatios["power"] = 0.99
+	*clone.MemoryUsedRatioAvg = 0.1
+	if orig.ThrottleViolationRatios["power"] == 0.99 {
+		t.Errorf("clone shares ThrottleViolationRatios map with original")
+	}
+	if *orig.MemoryUsedRatioAvg == 0.1 {
+		t.Errorf("clone shares MemoryUsedRatioAvg pointer with original")
+	}
+}
+
+// TestDCGMDevice_BinaryRoundtripWithSaturation verifies a DCGMDevice
+// carrying saturation survives the bingen binary codec, and that absent
+// saturation stays absent.
+func TestDCGMDevice_BinaryRoundtripWithSaturation(t *testing.T) {
+	cases := map[string]*DCGMDevice{
+		"nil saturation": {
+			UUID:      "GPU-1",
+			Device:    "nvidia0",
+			ModelName: "Tesla T4",
+		},
+		"populated saturation": mockSaturatedDCGMDevice(),
+		"empty saturation struct": {
+			UUID:       "GPU-1",
+			Saturation: &DCGMDeviceSaturation{},
+		},
+	}
+
+	for name, orig := range cases {
+		t.Run(name, func(t *testing.T) {
+			bs, err := orig.MarshalBinary()
+			if err != nil {
+				t.Fatalf("MarshalBinary: %s", err)
+			}
+
+			decoded := new(DCGMDevice)
+			if err := decoded.UnmarshalBinary(bs); err != nil {
+				t.Fatalf("UnmarshalBinary: %s", err)
+			}
+
+			if decoded.UUID != orig.UUID || decoded.Device != orig.Device || decoded.ModelName != orig.ModelName {
+				t.Errorf("device identity did not survive roundtrip: got %+v, want %+v", decoded, orig)
+			}
+			if decoded.PowerWatts != orig.PowerWatts || decoded.TemperatureCelsius != orig.TemperatureCelsius ||
+				decoded.ComputeUtilizationAvg != orig.ComputeUtilizationAvg || decoded.ComputeUtilizationMax != orig.ComputeUtilizationMax ||
+				decoded.MemoryUsedBytesAvg != orig.MemoryUsedBytesAvg || decoded.MemoryUsedBytesMax != orig.MemoryUsedBytesMax {
+				t.Errorf("device metrics did not survive roundtrip: got %+v, want %+v", decoded, orig)
+			}
+			if (decoded.Saturation == nil) != (orig.Saturation == nil) {
+				t.Fatalf("saturation presence did not survive roundtrip: got %v, want %v", decoded.Saturation, orig.Saturation)
+			}
+			if !reflect.DeepEqual(decoded.Saturation, orig.Saturation) {
+				t.Errorf("saturation did not survive roundtrip: got %+v, want %+v", decoded.Saturation, orig.Saturation)
+			}
+		})
+	}
+}

+ 136 - 0
core/pkg/model/kubemodel/device_interface.go

@@ -0,0 +1,136 @@
+package kubemodel
+
+import "time"
+
+// Accelerator devices are modeled as per-vendor concrete types (e.g.
+// DCGMDevice for NVIDIA) behind small interfaces rather than one generic
+// concrete struct. A generic type forces every vendor's changes into shared
+// code, implies relational fields not all vendors can provide, and blurs
+// semantics that differ between vendors (e.g. what "power" means for AMD vs
+// NVIDIA). Overly specific shared models have the opposite failure:
+// collecting more than is used and looking half-supported for vendors
+// missing a feature. Vendor structs keep agent-side transformation minimal
+// (data ships close to its source shape, e.g. DCGM for NVIDIA), while these
+// interfaces provide the common surface for identification, utilization,
+// performance, and saturation.
+//
+// Capacity is intentionally absent: it is not readily available from vendor
+// telemetry (e.g. DCGM) and is inferred via device plugins / DRA, which are
+// assumed prerequisites and provide the request half of device allocation.
+// Fan speed, PCIe topology detail, and topology are out of scope.
+
+// DeviceInfo identifies a physical or virtualized accelerator device and
+// its lifecycle within the observed window.
+type DeviceInfo interface {
+	// GetIdentifier returns the device UUID. For virtualized devices
+	// (e.g. a MIG instance) this is the UUID of the virtual device.
+	GetIdentifier() string
+
+	// GetType returns the human-readable device type, e.g. "GPU".
+	GetType() string
+
+	// GetName returns the marketing/model name, e.g. "NVIDIA A100 80GB".
+	GetName() string
+
+	// GetPower returns the device power draw in watts.
+	GetPower() float64
+
+	// GetStart returns the time the device came online in the observed
+	// window.
+	GetStart() time.Time
+
+	// GetEnd returns the time the device's observation ended.
+	GetEnd() time.Time
+
+	// GetParent returns the UUID of the parent device, or empty if none.
+	// For a MIG instance this is the UUID of the physical GPU.
+	GetParent() string
+}
+
+// DevicePerformance exposes utilization and performance measures for an
+// accelerator device.
+type DevicePerformance interface {
+	// GetComputeUtilizationAverage returns the average percentage of
+	// compute used over the observed window.
+	GetComputeUtilizationAverage() float64
+
+	// GetComputeUtilizationMax returns the max percentage of compute used.
+	GetComputeUtilizationMax() float64
+
+	// GetMemoryUtilizationAverage returns the average absolute memory used
+	// in bytes.
+	GetMemoryUtilizationAverage() float64
+
+	// GetMemoryUtilizationMax returns the max absolute memory used in
+	// bytes.
+	GetMemoryUtilizationMax() float64
+
+	// GetTemp returns the current device temperature in degrees Celsius.
+	GetTemp() float64
+}
+
+// DeviceSaturation exposes USE-method saturation measures for an
+// accelerator device: where DevicePerformance reports how busy the device
+// was, saturation reports work queued, rejected, or slowed because the
+// device could not service demand.
+//
+// Absence is meaningful and distinct from zero: scalar getters return
+// ok=false and map getters return nil when the vendor telemetry for that
+// signal was unavailable, so consumers never mistake "not measured" for
+// "not saturated". Names are vendor-neutral; the mapping to vendor
+// telemetry (e.g. DCGM fields for NVIDIA) lives in each implementation.
+// Ratios are fractions of the observed window in [0, 1].
+type DeviceSaturation interface {
+	// GetThrottleViolationRatios returns the fraction of the window the
+	// device spent throttled, keyed by vendor-defined reason name, from
+	// cumulative time-in-violation counters. Nil when unavailable.
+	GetThrottleViolationRatios() map[string]float64
+
+	// GetThrottleReasonRatios returns the fraction of the window each
+	// vendor-defined throttle reason state was active, from sampled state
+	// flags. Nil when unavailable.
+	GetThrottleReasonRatios() map[string]float64
+
+	// GetMemoryUsedRatioAvg returns average device-memory occupancy.
+	GetMemoryUsedRatioAvg() (float64, bool)
+
+	// GetMemoryUsedRatioMax returns peak device-memory occupancy.
+	GetMemoryUsedRatioMax() (float64, bool)
+
+	// GetMemoryPressureRatio returns the fraction of the window
+	// device-memory occupancy exceeded the configured threshold.
+	GetMemoryPressureRatio() (float64, bool)
+
+	// GetErrorEventCount returns the number of device error events
+	// (rejected work) observed in the window.
+	GetErrorEventCount() (float64, bool)
+
+	// GetMemoryBandwidthActiveAvg returns the average ratio of cycles the
+	// device memory interface was active.
+	GetMemoryBandwidthActiveAvg() (float64, bool)
+
+	// GetMemoryBandwidthActiveMax returns the peak ratio of cycles the
+	// device memory interface was active.
+	GetMemoryBandwidthActiveMax() (float64, bool)
+
+	// GetComputeActiveAvg returns the average ratio of cycles compute
+	// units had work resident. Together with GetComputeOccupancyAvg and
+	// GetMemoryBandwidthActiveAvg, consumers can distinguish
+	// compute-bound from bandwidth- or latency-bound saturation.
+	GetComputeActiveAvg() (float64, bool)
+
+	// GetComputeOccupancyAvg returns the average ratio of resident work
+	// to the compute units' maximum.
+	GetComputeOccupancyAvg() (float64, bool)
+
+	// GetHostLinkTxBytesAvg/RxBytesAvg return average host-interconnect
+	// throughput (e.g. PCIe) in bytes/sec. Link capacity is not derivable
+	// from vendor telemetry, so these are rates, not ratios.
+	GetHostLinkTxBytesAvg() (float64, bool)
+	GetHostLinkRxBytesAvg() (float64, bool)
+
+	// GetPeerLinkTxBytesAvg/RxBytesAvg return average device-to-device
+	// fabric throughput (e.g. NVLink) in bytes/sec.
+	GetPeerLinkTxBytesAvg() (float64, bool)
+	GetPeerLinkRxBytesAvg() (float64, bool)
+}

+ 509 - 27
core/pkg/model/kubemodel/kubemodel_codecs.go

@@ -35,7 +35,7 @@ const (
 	BinaryTagStringTable string = "BGST"
 	BinaryTagStringTable string = "BGST"
 
 
 	// DefaultCodecVersion is used for any resources listed in the Default version set
 	// DefaultCodecVersion is used for any resources listed in the Default version set
-	DefaultCodecVersion uint8 = 2
+	DefaultCodecVersion uint8 = 4
 )
 )
 
 
 //--------------------------------------------------------------------------
 //--------------------------------------------------------------------------
@@ -106,6 +106,7 @@ var typeMap map[string]reflect.Type = map[string]reflect.Type{
 	"CronJob":                 reflect.TypeFor[CronJob](),
 	"CronJob":                 reflect.TypeFor[CronJob](),
 	"DCGMContainer":           reflect.TypeFor[DCGMContainer](),
 	"DCGMContainer":           reflect.TypeFor[DCGMContainer](),
 	"DCGMDevice":              reflect.TypeFor[DCGMDevice](),
 	"DCGMDevice":              reflect.TypeFor[DCGMDevice](),
+	"DCGMDeviceSaturation":    reflect.TypeFor[DCGMDeviceSaturation](),
 	"DCGMPod":                 reflect.TypeFor[DCGMPod](),
 	"DCGMPod":                 reflect.TypeFor[DCGMPod](),
 	"DaemonSet":               reflect.TypeFor[DaemonSet](),
 	"DaemonSet":               reflect.TypeFor[DaemonSet](),
 	"Deployment":              reflect.TypeFor[Deployment](),
 	"Deployment":              reflect.TypeFor[Deployment](),
@@ -1798,6 +1799,32 @@ func (target *DCGMDevice) MarshalBinaryWithContext(ctx *EncodingContext) (err er
 		// --- [end][write][map](map[string]DCGMPod) ---
 		// --- [end][write][map](map[string]DCGMPod) ---
 
 
 	}
 	}
+	if target.Saturation == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		// --- [begin][write][struct](DCGMDeviceSaturation) ---
+		buff.WriteInt(0) // [compatibility, unused]
+		errD := target.Saturation.MarshalBinaryWithContext(ctx)
+		if errD != nil {
+			return errD
+		}
+		// --- [end][write][struct](DCGMDeviceSaturation) ---
+
+	}
+
+	buff.WriteFloat64(target.PowerWatts) // write float64
+
+	buff.WriteFloat64(target.TemperatureCelsius) // write float64
+
+	buff.WriteFloat64(target.ComputeUtilizationAvg) // write float64
+
+	buff.WriteFloat64(target.ComputeUtilizationMax) // write float64
+
+	buff.WriteFloat64(target.MemoryUsedBytesAvg) // write float64
+
+	buff.WriteFloat64(target.MemoryUsedBytesMax) // write float64
 
 
 	return nil
 	return nil
 }
 }
@@ -1940,6 +1967,484 @@ func (target *DCGMDevice) UnmarshalBinaryWithContext(ctx *DecodingContext) (err
 
 
 	}
 	}
 
 
+	// field version check
+	if uint8(3) <= version {
+		if buff.ReadUInt8() == uint8(0) {
+			target.Saturation = nil
+		} else {
+
+			// --- [begin][read][struct](DCGMDeviceSaturation) ---
+			aa := new(DCGMDeviceSaturation)
+			buff.ReadInt() // [compatibility, unused]
+			errD := aa.UnmarshalBinaryWithContext(ctx)
+			if errD != nil {
+				return errD
+			}
+			target.Saturation = aa
+			// --- [end][read][struct](DCGMDeviceSaturation) ---
+
+		}
+
+	} else {
+		target.Saturation = nil
+	}
+	// field version check
+	if uint8(4) <= version {
+
+		bb := buff.ReadFloat64() // read float64
+		target.PowerWatts = bb
+
+	} else {
+		target.PowerWatts = float64(0) // default
+	}
+	// field version check
+	if uint8(4) <= version {
+
+		cc := buff.ReadFloat64() // read float64
+		target.TemperatureCelsius = cc
+
+	} else {
+		target.TemperatureCelsius = float64(0) // default
+	}
+	// field version check
+	if uint8(4) <= version {
+
+		dd := buff.ReadFloat64() // read float64
+		target.ComputeUtilizationAvg = dd
+
+	} else {
+		target.ComputeUtilizationAvg = float64(0) // default
+	}
+	// field version check
+	if uint8(4) <= version {
+
+		ee := buff.ReadFloat64() // read float64
+		target.ComputeUtilizationMax = ee
+
+	} else {
+		target.ComputeUtilizationMax = float64(0) // default
+	}
+	// field version check
+	if uint8(4) <= version {
+
+		ff := buff.ReadFloat64() // read float64
+		target.MemoryUsedBytesAvg = ff
+
+	} else {
+		target.MemoryUsedBytesAvg = float64(0) // default
+	}
+	// field version check
+	if uint8(4) <= version {
+
+		gg := buff.ReadFloat64() // read float64
+		target.MemoryUsedBytesMax = gg
+
+	} else {
+		target.MemoryUsedBytesMax = float64(0) // default
+	}
+
+	return nil
+}
+
+//--------------------------------------------------------------------------
+//  DCGMDeviceSaturation
+//--------------------------------------------------------------------------
+
+// MarshalBinary serializes the internal properties of this DCGMDeviceSaturation instance
+// into a byte array
+func (target *DCGMDeviceSaturation) MarshalBinary() (data []byte, err error) {
+	ctx := &EncodingContext{
+		Buffer: util.NewBuffer(),
+		Table:  nil,
+	}
+
+	e := target.MarshalBinaryWithContext(ctx)
+	if e != nil {
+		return nil, e
+	}
+
+	encBytes := ctx.Buffer.Bytes()
+	return encBytes, nil
+}
+
+// MarshalBinaryWithContext serializes the internal properties of this DCGMDeviceSaturation instance
+// into a byte array leveraging a predefined context.
+func (target *DCGMDeviceSaturation) MarshalBinaryWithContext(ctx *EncodingContext) (err error) {
+	// panics are recovered and propagated as errors
+	defer func() {
+		if r := recover(); r != nil {
+			if e, ok := r.(error); ok {
+				err = e
+			} else if s, ok := r.(string); ok {
+				err = fmt.Errorf("unexpected panic: %s", s)
+			} else {
+				err = fmt.Errorf("unexpected panic: %+v", r)
+			}
+		}
+	}()
+
+	buff := ctx.Buffer
+	buff.WriteUInt8(DefaultCodecVersion) // version
+
+	if target.ThrottleViolationRatios == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		// --- [begin][write][map](map[string]float64) ---
+		buff.WriteInt(len(target.ThrottleViolationRatios)) // map length
+		for v, z := range target.ThrottleViolationRatios {
+			if ctx.IsStringTable() {
+				a := ctx.Table.AddOrGet(v)
+				buff.WriteInt(a) // write table index
+			} else {
+				buff.WriteString(v) // write string
+			}
+
+			buff.WriteFloat64(z) // write float64
+
+		}
+		// --- [end][write][map](map[string]float64) ---
+
+	}
+	if target.ThrottleReasonRatios == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		// --- [begin][write][map](map[string]float64) ---
+		buff.WriteInt(len(target.ThrottleReasonRatios)) // map length
+		for vv, zz := range target.ThrottleReasonRatios {
+			if ctx.IsStringTable() {
+				b := ctx.Table.AddOrGet(vv)
+				buff.WriteInt(b) // write table index
+			} else {
+				buff.WriteString(vv) // write string
+			}
+
+			buff.WriteFloat64(zz) // write float64
+
+		}
+		// --- [end][write][map](map[string]float64) ---
+
+	}
+	if target.MemoryUsedRatioAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.MemoryUsedRatioAvg) // write float64
+
+	}
+	if target.MemoryUsedRatioMax == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.MemoryUsedRatioMax) // write float64
+
+	}
+	if target.MemoryPressureRatio == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.MemoryPressureRatio) // write float64
+
+	}
+	if target.XIDErrorCount == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.XIDErrorCount) // write float64
+
+	}
+	if target.DRAMActiveAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.DRAMActiveAvg) // write float64
+
+	}
+	if target.DRAMActiveMax == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.DRAMActiveMax) // write float64
+
+	}
+	if target.SMActiveAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.SMActiveAvg) // write float64
+
+	}
+	if target.SMOccupancyAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.SMOccupancyAvg) // write float64
+
+	}
+	if target.PCIeTxBytesAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.PCIeTxBytesAvg) // write float64
+
+	}
+	if target.PCIeRxBytesAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.PCIeRxBytesAvg) // write float64
+
+	}
+	if target.NVLinkTxBytesAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.NVLinkTxBytesAvg) // write float64
+
+	}
+	if target.NVLinkRxBytesAvg == nil {
+		buff.WriteUInt8(uint8(0)) // write nil byte
+	} else {
+		buff.WriteUInt8(uint8(1)) // write non-nil byte
+
+		buff.WriteFloat64(*target.NVLinkRxBytesAvg) // write float64
+
+	}
+
+	return nil
+}
+
+// UnmarshalBinary uses the data passed byte array to set all the internal properties of
+// the DCGMDeviceSaturation type
+func (target *DCGMDeviceSaturation) UnmarshalBinary(data []byte) error {
+	ctx := NewDecodingContextFromBytes(data)
+	defer ctx.Close()
+
+	err := target.UnmarshalBinaryWithContext(ctx)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// UnmarshalBinaryFromReader uses the io.Reader data to set all the internal properties of
+// the DCGMDeviceSaturation type
+func (target *DCGMDeviceSaturation) UnmarshalBinaryFromReader(reader io.Reader) error {
+	ctx := NewDecodingContextFromReader(reader)
+	defer ctx.Close()
+
+	err := target.UnmarshalBinaryWithContext(ctx)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// UnmarshalBinaryWithContext uses the context containing a string table and binary buffer to set all the internal properties of
+// the DCGMDeviceSaturation type
+func (target *DCGMDeviceSaturation) UnmarshalBinaryWithContext(ctx *DecodingContext) (err error) {
+	// panics are recovered and propagated as errors
+	defer func() {
+		if r := recover(); r != nil {
+			if e, ok := r.(error); ok {
+				err = e
+			} else if s, ok := r.(string); ok {
+				err = fmt.Errorf("unexpected panic: %s", s)
+			} else {
+				err = fmt.Errorf("unexpected panic: %+v", r)
+			}
+		}
+	}()
+
+	buff := ctx.Buffer
+	version := buff.ReadUInt8()
+
+	if version > DefaultCodecVersion {
+		return fmt.Errorf("Invalid Version Unmarshalling DCGMDeviceSaturation. Expected %d or less, got %d", DefaultCodecVersion, version)
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.ThrottleViolationRatios = nil
+	} else {
+		// --- [begin][read][map](map[string]float64) ---
+		b := buff.ReadInt() // map len
+		a := make(map[string]float64, b)
+		for range b {
+			var v string
+			var d string
+			if ctx.IsStringTable() {
+				e := buff.ReadInt() // read string index
+				d = ctx.Table.At(e)
+			} else {
+				d = buff.ReadString() // read string
+			}
+			c := d
+			v = c
+
+			var z float64
+			f := buff.ReadFloat64() // read float64
+			z = f
+
+			a[v] = z
+		}
+		target.ThrottleViolationRatios = a
+		// --- [end][read][map](map[string]float64) ---
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.ThrottleReasonRatios = nil
+	} else {
+		// --- [begin][read][map](map[string]float64) ---
+		h := buff.ReadInt() // map len
+		g := make(map[string]float64, h)
+		for range h {
+			var vv string
+			var m string
+			if ctx.IsStringTable() {
+				n := buff.ReadInt() // read string index
+				m = ctx.Table.At(n)
+			} else {
+				m = buff.ReadString() // read string
+			}
+			l := m
+			vv = l
+
+			var zz float64
+			o := buff.ReadFloat64() // read float64
+			zz = o
+
+			g[vv] = zz
+		}
+		target.ThrottleReasonRatios = g
+		// --- [end][read][map](map[string]float64) ---
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.MemoryUsedRatioAvg = nil
+	} else {
+
+		p := buff.ReadFloat64() // read float64
+		target.MemoryUsedRatioAvg = &p
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.MemoryUsedRatioMax = nil
+	} else {
+
+		q := buff.ReadFloat64() // read float64
+		target.MemoryUsedRatioMax = &q
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.MemoryPressureRatio = nil
+	} else {
+
+		r := buff.ReadFloat64() // read float64
+		target.MemoryPressureRatio = &r
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.XIDErrorCount = nil
+	} else {
+
+		s := buff.ReadFloat64() // read float64
+		target.XIDErrorCount = &s
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.DRAMActiveAvg = nil
+	} else {
+
+		t := buff.ReadFloat64() // read float64
+		target.DRAMActiveAvg = &t
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.DRAMActiveMax = nil
+	} else {
+
+		u := buff.ReadFloat64() // read float64
+		target.DRAMActiveMax = &u
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.SMActiveAvg = nil
+	} else {
+
+		w := buff.ReadFloat64() // read float64
+		target.SMActiveAvg = &w
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.SMOccupancyAvg = nil
+	} else {
+
+		x := buff.ReadFloat64() // read float64
+		target.SMOccupancyAvg = &x
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.PCIeTxBytesAvg = nil
+	} else {
+
+		y := buff.ReadFloat64() // read float64
+		target.PCIeTxBytesAvg = &y
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.PCIeRxBytesAvg = nil
+	} else {
+
+		aa := buff.ReadFloat64() // read float64
+		target.PCIeRxBytesAvg = &aa
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.NVLinkTxBytesAvg = nil
+	} else {
+
+		bb := buff.ReadFloat64() // read float64
+		target.NVLinkTxBytesAvg = &bb
+
+	}
+
+	if buff.ReadUInt8() == uint8(0) {
+		target.NVLinkRxBytesAvg = nil
+	} else {
+
+		cc := buff.ReadFloat64() // read float64
+		target.NVLinkRxBytesAvg = &cc
+
+	}
+
 	return nil
 	return nil
 }
 }
 
 
@@ -4129,7 +4634,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 		if buff.ReadUInt8() == uint8(0) {
 		if buff.ReadUInt8() == uint8(0) {
 			target.Metadata = nil
 			target.Metadata = nil
 		} else {
 		} else {
-
 			// --- [begin][read][struct](Metadata) ---
 			// --- [begin][read][struct](Metadata) ---
 			a := new(Metadata)
 			a := new(Metadata)
 			buff.ReadInt() // [compatibility, unused]
 			buff.ReadInt() // [compatibility, unused]
@@ -4147,6 +4651,7 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 	}
 	}
 	// field version check
 	// field version check
 	if uint8(1) <= version {
 	if uint8(1) <= version {
+
 		// --- [begin][read][struct](Window) ---
 		// --- [begin][read][struct](Window) ---
 		b := new(Window)
 		b := new(Window)
 		buff.ReadInt() // [compatibility, unused]
 		buff.ReadInt() // [compatibility, unused]
@@ -4164,7 +4669,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 		if buff.ReadUInt8() == uint8(0) {
 		if buff.ReadUInt8() == uint8(0) {
 			target.Cluster = nil
 			target.Cluster = nil
 		} else {
 		} else {
-
 			// --- [begin][read][struct](Cluster) ---
 			// --- [begin][read][struct](Cluster) ---
 			c := new(Cluster)
 			c := new(Cluster)
 			buff.ReadInt() // [compatibility, unused]
 			buff.ReadInt() // [compatibility, unused]
@@ -4204,7 +4708,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					z = nil
 					z = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](Namespace) ---
 					// --- [begin][read][struct](Namespace) ---
 					l := new(Namespace)
 					l := new(Namespace)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4295,7 +4798,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzz = nil
 					zzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](Service) ---
 					// --- [begin][read][struct](Service) ---
 					y := new(Service)
 					y := new(Service)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4386,7 +4888,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzz = nil
 					zzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](StatefulSet) ---
 					// --- [begin][read][struct](StatefulSet) ---
 					oo := new(StatefulSet)
 					oo := new(StatefulSet)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4432,7 +4933,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzzz = nil
 					zzzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](DaemonSet) ---
 					// --- [begin][read][struct](DaemonSet) ---
 					uu := new(DaemonSet)
 					uu := new(DaemonSet)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4478,7 +4978,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzzzz = nil
 					zzzzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](Job) ---
 					// --- [begin][read][struct](Job) ---
 					ccc := new(Job)
 					ccc := new(Job)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4524,7 +5023,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzzzzz = nil
 					zzzzzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](CronJob) ---
 					// --- [begin][read][struct](CronJob) ---
 					lll := new(CronJob)
 					lll := new(CronJob)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4616,7 +5114,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzzzzzzz = nil
 					zzzzzzzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](Node) ---
 					// --- [begin][read][struct](Node) ---
 					yyy := new(Node)
 					yyy := new(Node)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4707,7 +5204,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzzzzzzzzz = nil
 					zzzzzzzzzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](PersistentVolumeClaim) ---
 					// --- [begin][read][struct](PersistentVolumeClaim) ---
 					oooo := new(PersistentVolumeClaim)
 					oooo := new(PersistentVolumeClaim)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4844,7 +5340,6 @@ func (target *KubeModelSet) UnmarshalBinaryWithContext(ctx *DecodingContext) (er
 				if buff.ReadUInt8() == uint8(0) {
 				if buff.ReadUInt8() == uint8(0) {
 					zzzzzzzzzzzzzzz = nil
 					zzzzzzzzzzzzzzz = nil
 				} else {
 				} else {
-
 					// --- [begin][read][struct](DCGMDevice) ---
 					// --- [begin][read][struct](DCGMDevice) ---
 					lllll := new(DCGMDevice)
 					lllll := new(DCGMDevice)
 					buff.ReadInt() // [compatibility, unused]
 					buff.ReadInt() // [compatibility, unused]
@@ -4923,7 +5418,7 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 		}
 		}
 
 
 		fi = BingenFieldInfo{
 		fi = BingenFieldInfo{
-			Type: reflect.TypeFor[Metadata](),
+			Type: reflect.TypeFor[*Metadata](),
 			Name: "Metadata",
 			Name: "Metadata",
 		}
 		}
 		// field version check
 		// field version check
@@ -4986,7 +5481,7 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 		}
 		}
 
 
 		fi = BingenFieldInfo{
 		fi = BingenFieldInfo{
-			Type: reflect.TypeFor[Cluster](),
+			Type: reflect.TypeFor[*Cluster](),
 			Name: "Cluster",
 			Name: "Cluster",
 		}
 		}
 		// field version check
 		// field version check
@@ -5052,7 +5547,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						z = nil
 						z = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](Namespace) ---
 						// --- [begin][read][struct](Namespace) ---
 						n := new(Namespace)
 						n := new(Namespace)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5171,7 +5665,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzz = nil
 						zzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](Service) ---
 						// --- [begin][read][struct](Service) ---
 						y := new(Service)
 						y := new(Service)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5290,7 +5783,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzz = nil
 						zzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](StatefulSet) ---
 						// --- [begin][read][struct](StatefulSet) ---
 						mm := new(StatefulSet)
 						mm := new(StatefulSet)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5350,7 +5842,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzzz = nil
 						zzzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](DaemonSet) ---
 						// --- [begin][read][struct](DaemonSet) ---
 						rr := new(DaemonSet)
 						rr := new(DaemonSet)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5410,7 +5901,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzzzz = nil
 						zzzzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](Job) ---
 						// --- [begin][read][struct](Job) ---
 						xx := new(Job)
 						xx := new(Job)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5470,7 +5960,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzzzzz = nil
 						zzzzzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](CronJob) ---
 						// --- [begin][read][struct](CronJob) ---
 						ddd := new(CronJob)
 						ddd := new(CronJob)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5590,7 +6079,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzzzzzzz = nil
 						zzzzzzzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](Node) ---
 						// --- [begin][read][struct](Node) ---
 						qqq := new(Node)
 						qqq := new(Node)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5709,7 +6197,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzzzzzzzzz = nil
 						zzzzzzzzzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](PersistentVolumeClaim) ---
 						// --- [begin][read][struct](PersistentVolumeClaim) ---
 						cccc := new(PersistentVolumeClaim)
 						cccc := new(PersistentVolumeClaim)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -5888,7 +6375,6 @@ func (stream *KubeModelSetStream) Stream() iter.Seq2[BingenFieldInfo, *BingenVal
 					if buff.ReadUInt8() == uint8(0) {
 					if buff.ReadUInt8() == uint8(0) {
 						zzzzzzzzzzzzzzz = nil
 						zzzzzzzzzzzzzzz = nil
 					} else {
 					} else {
-
 						// --- [begin][read][struct](DCGMDevice) ---
 						// --- [begin][read][struct](DCGMDevice) ---
 						uuuu := new(DCGMDevice)
 						uuuu := new(DCGMDevice)
 						buff.ReadInt() // [compatibility, unused]
 						buff.ReadInt() // [compatibility, unused]
@@ -9474,7 +9960,6 @@ func (target *ResourceQuotaSpecHard) UnmarshalBinaryWithContext(ctx *DecodingCon
 
 
 	// field version check
 	// field version check
 	if uint8(1) <= version {
 	if uint8(1) <= version {
-
 		// --- [begin][read][alias](ResourceQuantities) ---
 		// --- [begin][read][alias](ResourceQuantities) ---
 		var a map[Resource]ResourceQuantity
 		var a map[Resource]ResourceQuantity
 		if buff.ReadUInt8() == uint8(0) {
 		if buff.ReadUInt8() == uint8(0) {
@@ -9523,7 +10008,6 @@ func (target *ResourceQuotaSpecHard) UnmarshalBinaryWithContext(ctx *DecodingCon
 	}
 	}
 	// field version check
 	// field version check
 	if uint8(1) <= version {
 	if uint8(1) <= version {
-
 		// --- [begin][read][alias](ResourceQuantities) ---
 		// --- [begin][read][alias](ResourceQuantities) ---
 		var l map[Resource]ResourceQuantity
 		var l map[Resource]ResourceQuantity
 		if buff.ReadUInt8() == uint8(0) {
 		if buff.ReadUInt8() == uint8(0) {
@@ -9870,7 +10354,6 @@ func (target *ResourceQuotaStatusUsed) UnmarshalBinaryWithContext(ctx *DecodingC
 
 
 	// field version check
 	// field version check
 	if uint8(1) <= version {
 	if uint8(1) <= version {
-
 		// --- [begin][read][alias](ResourceQuantities) ---
 		// --- [begin][read][alias](ResourceQuantities) ---
 		var a map[Resource]ResourceQuantity
 		var a map[Resource]ResourceQuantity
 		if buff.ReadUInt8() == uint8(0) {
 		if buff.ReadUInt8() == uint8(0) {
@@ -9919,7 +10402,6 @@ func (target *ResourceQuotaStatusUsed) UnmarshalBinaryWithContext(ctx *DecodingC
 	}
 	}
 	// field version check
 	// field version check
 	if uint8(1) <= version {
 	if uint8(1) <= version {
-
 		// --- [begin][read][alias](ResourceQuantities) ---
 		// --- [begin][read][alias](ResourceQuantities) ---
 		var l map[Resource]ResourceQuantity
 		var l map[Resource]ResourceQuantity
 		if buff.ReadUInt8() == uint8(0) {
 		if buff.ReadUInt8() == uint8(0) {

+ 164 - 0
docs/gpu-saturation.md

@@ -0,0 +1,164 @@
+# GPU Saturation Signals
+
+OpenCost derives GPU **saturation** signals from
+[dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) metrics, following
+the [USE method](https://www.brendangregg.com/usemethod.html):
+
+- **Utilization** — how busy the GPU was (already exposed as
+  `gpuAllocation.gpuUsageAverage` from `DCGM_FI_PROF_GR_ENGINE_ACTIVE`).
+- **Saturation** — work that was queued, rejected, or slowed because the GPU
+  could not service demand. That is what the signals below report.
+
+Every signal is an independent primitive. OpenCost deliberately does **not**
+compute a composite saturation score; consumers combine the primitives as
+they see fit.
+
+## Absence semantics
+
+A missing field always means *the source metric was unavailable* — no
+dcgm-exporter in the cluster, the DCGM field is not in the exporter's
+configuration, or the GPU lacks DCP profiling support. OpenCost never emits
+a zero in place of missing data, so `0` can be trusted to mean "observed,
+and not saturated".
+
+## Allocation API
+
+Saturation appears on the Allocation API under
+`gpuAllocation.saturation`, per container:
+
+```json
+{
+  "gpuAllocation": {
+    "gpuDevice": "nvidia0",
+    "gpuModel": "Tesla T4",
+    "gpuUUID": "GPU-...",
+    "saturation": {
+      "throttleViolationRatios": { "power": 0.12, "thermal": 0.01 },
+      "throttleReasonRatios": { "sw_power_cap": 0.15 },
+      "memoryUsedRatioAvg": 0.81,
+      "memoryUsedRatioMax": 0.97,
+      "memoryPressureRatio": 0.25,
+      "xidErrorCount": 0,
+      "dramActiveAvg": 0.62,
+      "smActiveAvg": 0.55,
+      "smOccupancyAvg": 0.31,
+      "pcieTxBytesAvg": 1.2e9,
+      "pcieRxBytesAvg": 2.0e9
+    }
+  }
+}
+```
+
+Controlled by `GPU_SATURATION_METRICS_ENABLED` (default `true`).
+
+## Signal reference
+
+Ratios are fractions of the queried window in `[0, 1]` unless noted.
+
+| Field | DCGM source | In default dcgm-exporter config? | Needs DCP? | Meaning |
+|---|---|---|---|---|
+| `throttleViolationRatios` (`power`, `thermal`, `sync_boost`, `board_limit`) | `DCGM_FI_DEV_POWER_VIOLATION`, `DCGM_FI_DEV_THERMAL_VIOLATION`, `DCGM_FI_DEV_SYNC_BOOST_VIOLATION`, `DCGM_FI_DEV_BOARD_LIMIT_VIOLATION` | Yes | No | Fraction of the window the GPU spent throttled for the reason, from cumulative microsecond violation counters. The strongest direct saturation signal available by default. |
+| `throttleReasonRatios` (`sw_power_cap`, `hw_slowdown`, `sync_boost`, `sw_thermal`, `hw_thermal`, `hw_power_brake`) | `DCGM_FI_DEV_CLOCK_THROTTLE_REASONS` (renamed `DCGM_FI_DEV_CLOCKS_EVENT_REASONS` in DCGM 3.3+; OpenCost queries both) | **No — must be enabled** | No | Fraction of samples in which each saturation-relevant bit of the NVML throttle-reasons bitmask was set. Richer reason breakdown than the violation counters (hardware slowdown, power brake). Idle/configured-clock bits are excluded by design. Reported for the whole physical GPU even under MIG or time-slicing. |
+| `memoryUsedRatioAvg`, `memoryUsedRatioMax` | `DCGM_FI_DEV_FB_USED`, `DCGM_FI_DEV_FB_FREE` | Yes | No | Framebuffer occupancy `used / (used + free)`. Sustained values near 1.0 mean new allocations are likely to fail or force eviction. |
+| `memoryPressureRatio` | same | Yes | No | Fraction of the window occupancy was at or above the configured threshold (`GPU_MEMORY_SATURATION_THRESHOLD`, default `0.9`). |
+| `xidErrorCount` | `DCGM_FI_DEV_XID_ERRORS` | Yes | No | XID error events observed in the window, a rejected-work signal. The DCGM field reports the *last* XID code, so consecutive identical errors are undercounted. |
+| `dramActiveAvg`, `dramActiveMax` | `DCGM_FI_PROF_DRAM_ACTIVE` | Yes | **Yes** | Ratio of cycles the device memory interface was active. Near-ceiling values with low `smOccupancyAvg` indicate a memory-bandwidth-bound workload. |
+| `smActiveAvg` | `DCGM_FI_PROF_SM_ACTIVE` | **No — must be enabled** | **Yes** | Ratio of cycles at least one warp was resident on any SM. |
+| `smOccupancyAvg` | `DCGM_FI_PROF_SM_OCCUPANCY` | **No — must be enabled** | **Yes** | Ratio of resident warps to the SM maximum. Together with `smActiveAvg` and `dramActiveAvg`, lets consumers distinguish compute-bound vs bandwidth-bound vs latency-bound saturation. |
+| `pcieTxBytesAvg`, `pcieRxBytesAvg` | `DCGM_FI_PROF_PCIE_TX_BYTES`, `DCGM_FI_PROF_PCIE_RX_BYTES` | Yes | **Yes** | Average PCIe throughput in bytes/sec. DCGM does not export link capacity, so these are raw rates; deriving a capacity ratio is future work. |
+| `nvlinkTxBytesAvg`, `nvlinkRxBytesAvg` | `DCGM_FI_PROF_NVLINK_TX_BYTES`, `DCGM_FI_PROF_NVLINK_RX_BYTES` | **No — must be enabled** | **Yes** | Average NVLink throughput in bytes/sec; same capacity caveat as PCIe. |
+
+"Needs DCP" means the field comes from DCGM's profiling module (DCP), which
+requires Volta or newer GPUs and is unavailable in some virtualized
+environments. "Must be enabled" means the field exists in the dcgm-exporter
+default configuration file but is commented out (or absent), and must be
+uncommented/added for the signal to appear.
+
+### Interpretation guidance (USE)
+
+- High utilization with **zero** throttle/pressure ratios: the GPU is busy
+  but keeping up — buying a faster GPU may not help.
+- `throttleViolationRatios.power` or `sw_power_cap` sustained above zero:
+  the GPU is power-limited; demand exceeds what the power envelope can
+  service.
+- `memoryUsedRatioMax` near 1.0 or `memoryPressureRatio` > 0 alongside
+  `xidErrorCount` > 0: work is likely being rejected (OOM-style failures).
+- `dramActiveAvg` near ceiling with low `smOccupancyAvg`: bandwidth-bound;
+  with high `smOccupancyAvg`: genuinely compute-saturated.
+
+### Attribution caveat
+
+Device-level signals (throttling, framebuffer, XID) are attributed to
+containers via the pod labels dcgm-exporter attaches. For exclusively
+assigned GPUs this is exact. For time-sliced or MPS-shared GPUs, every
+sharing container sees the same device-level saturation; the signal tells
+you the *device* was saturated, not which tenant caused it. MIG slices are
+reported as distinct devices (dcgm-exporter `GPU_I_ID` / `GPU_I_PROFILE`
+labels), except the throttle-reasons bitmask, which is physical-GPU-scoped.
+
+## Scheduler-level saturation metrics
+
+DCGM cannot see work that never reached a GPU. OpenCost additionally emits
+cluster-scoped gauges on `/metrics` from the Kubernetes scheduler's view,
+one series per observed GPU resource name (`nvidia.com/gpu`,
+`nvidia.com/gpu.shared`, `nvidia.com/mig-*`):
+
+| Metric | Meaning |
+|---|---|
+| `cluster_gpu_pending_pod_count{resource}` | Pods in `Pending` phase requesting the resource. |
+| `cluster_gpu_pending_request_total{resource}` | GPU units requested by those pending pods. |
+| `cluster_gpu_requested_allocatable_ratio{resource}` | Units requested by all non-terminated pods divided by allocatable units. Values near or above 1 indicate scheduler-level saturation, including exhaustion of time-sliced/MPS replicas. Only emitted when allocatable capacity exists. |
+
+These can be disabled individually through the standard metrics
+configuration (the same mechanism as `node_gpu_count` et al.).
+
+## Configuration
+
+| Variable | Default | Description |
+|---|---|---|
+| `GPU_SATURATION_METRICS_ENABLED` | `true` | Query and apply GPU saturation signals in the Allocation pipeline. |
+| `GPU_MEMORY_SATURATION_THRESHOLD` | `0.9` | Framebuffer occupancy ratio above which the GPU counts as memory-pressured. Values outside `(0, 1]` fall back to the default. |
+
+## Data source differences
+
+- **Prometheus source**: the throttle-bitmask and memory-pressure signals
+  use PromQL subqueries at the configured query resolution.
+- **Collector source**: framebuffer occupancy is joined from FB_USED and
+  FB_FREE per scrape by a metric synthesizer, producing a per-sample ratio
+  that the occupancy and pressure aggregations consume. All signals are
+  supported.
+
+## Allocation half: DRA
+
+Telemetry reports what devices *did*; Dynamic Resource Allocation
+(`resource.k8s.io/v1`, k8s 1.34+) reports what was *requested, allocated,
+and reserved*. The kubemodel carries both:
+
+- `ResourceSlices` — driver-advertised device capacity per node/pool,
+  including driver-published attributes and capacity quantities.
+- `ResourceClaims` — device requests (class, count), scheduler allocations
+  (driver/pool/device), and the pods that reserved them
+  (`reservedForPodUids`). A reserved-but-idle device appears here even
+  though it never shows in DCGM usage.
+- Hydration joins the halves: each allocated device's UUID is resolved from
+  its slice attributes (`uuid`, or driver-qualified `*/uuid`), matching
+  `DCGMDevice.UUID` so claims link to telemetry directly.
+
+Claims and slices are cluster state, not time series: the model carries the
+state observed at hydration time. Clusters without the DRA API (or without
+RBAC) hydrate nothing — absence, not zeros.
+
+**RBAC**: the OpenCost service account needs `list`/`watch` on
+`resourceclaims` and `resourceslices` in the `resource.k8s.io` API group
+(helm chart follow-up).
+
+## Future work
+
+- Non-NVIDIA GPUs (AMD ROCm SMI exporter, Intel XPU manager) — the signal
+  taxonomy is vendor-neutral, the queries are not.
+- PCIe/NVLink capacity ratios, once link capacity can be derived per model.
+- Device-level saturation in the kubemodel pipeline is modeled on the
+  vendor-specific device type (`DCGMDevice.Saturation`) behind the
+  vendor-neutral `DeviceInfo`/`DevicePerformance`/`DeviceSaturation`
+  interfaces, but not yet populated; it will be wired into the DCGM
+  hydration path alongside the existing device usage collection.

+ 10 - 0
pkg/env/costmodel.go

@@ -93,6 +93,8 @@ const (
 
 
 	CarbonEstimatesEnabledEnvVar = "CARBON_ESTIMATES_ENABLED"
 	CarbonEstimatesEnabledEnvVar = "CARBON_ESTIMATES_ENABLED"
 
 
+	GPUSaturationMetricsEnabledEnvVar = "GPU_SATURATION_METRICS_ENABLED"
+
 	KubernetesResourceAccessEnvVar = "KUBERNETES_RESOURCE_ACCESS"
 	KubernetesResourceAccessEnvVar = "KUBERNETES_RESOURCE_ACCESS"
 	UseCacheV1                     = "USE_CACHE_V1"
 	UseCacheV1                     = "USE_CACHE_V1"
 
 
@@ -384,6 +386,14 @@ func IsCarbonEstimatesEnabled() bool {
 	return env.GetBool(CarbonEstimatesEnabledEnvVar, false)
 	return env.GetBool(CarbonEstimatesEnabledEnvVar, false)
 }
 }
 
 
+// IsGPUSaturationMetricsEnabled controls whether the kubemodel hydration
+// queries and applies GPU saturation signals (USE method) derived from DCGM
+// exporter metrics. Enabled by default; clusters without dcgm-exporter
+// simply receive empty results.
+func IsGPUSaturationMetricsEnabled() bool {
+	return env.GetBool(GPUSaturationMetricsEnabledEnvVar, true)
+}
+
 // HasKubernetesResourceAccess can be set to false if Opencost is run without access to the kubernetes resources
 // HasKubernetesResourceAccess can be set to false if Opencost is run without access to the kubernetes resources
 func HasKubernetesResourceAccess() bool { return env.GetBool(KubernetesResourceAccessEnvVar, true) }
 func HasKubernetesResourceAccess() bool { return env.GetBool(KubernetesResourceAccessEnvVar, true) }
 
 

+ 199 - 0
pkg/kubemodel/dcgm_saturation.go

@@ -0,0 +1,199 @@
+package kubemodel
+
+import (
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/model/kubemodel"
+	"github.com/opencost/opencost/core/pkg/source"
+)
+
+// DCGM device saturation hydration. The saturation queries are
+// container-attributed series (dcgm-exporter duplicates each device-level
+// value onto every pod sharing the device), so hydration reduces them to
+// one value per device UUID. MIG instances surface as their own UUIDs when
+// the exporter runs in MIG mode, so slices hydrate as distinct devices and
+// PodUsages preserves the container-to-slice association.
+
+// dcgmSaturationFutures holds the in-flight saturation queries for one
+// computeDCGMDevices pass, mirroring the bundle used by ComputeAllocation.
+type dcgmSaturationFutures struct {
+	throttleViolation *source.QueryGroupFuture[source.GPUSaturationResult]
+	throttleReason    *source.QueryGroupFuture[source.GPUSaturationResult]
+	memoryUsedAvg     *source.QueryGroupFuture[source.GPUSaturationResult]
+	memoryUsedMax     *source.QueryGroupFuture[source.GPUSaturationResult]
+	memoryPressure    *source.QueryGroupFuture[source.GPUSaturationResult]
+	xidErrorCount     *source.QueryGroupFuture[source.GPUSaturationResult]
+	dramActiveAvg     *source.QueryGroupFuture[source.GPUSaturationResult]
+	dramActiveMax     *source.QueryGroupFuture[source.GPUSaturationResult]
+	smActiveAvg       *source.QueryGroupFuture[source.GPUSaturationResult]
+	smOccupancyAvg    *source.QueryGroupFuture[source.GPUSaturationResult]
+	pcieTxBytesAvg    *source.QueryGroupFuture[source.GPUSaturationResult]
+	pcieRxBytesAvg    *source.QueryGroupFuture[source.GPUSaturationResult]
+	nvlinkTxBytesAvg  *source.QueryGroupFuture[source.GPUSaturationResult]
+	nvlinkRxBytesAvg  *source.QueryGroupFuture[source.GPUSaturationResult]
+}
+
+func startDCGMSaturationQueries(grp *source.QueryGroup, metrics source.MetricsQuerier, start, end time.Time) *dcgmSaturationFutures {
+	return &dcgmSaturationFutures{
+		throttleViolation: source.WithGroup(grp, metrics.QueryGPUThrottleViolationRatio(start, end)),
+		throttleReason:    source.WithGroup(grp, metrics.QueryGPUThrottleReasonRatio(start, end)),
+		memoryUsedAvg:     source.WithGroup(grp, metrics.QueryGPUMemoryUsedRatioAvg(start, end)),
+		memoryUsedMax:     source.WithGroup(grp, metrics.QueryGPUMemoryUsedRatioMax(start, end)),
+		memoryPressure:    source.WithGroup(grp, metrics.QueryGPUMemoryPressureRatio(start, end)),
+		xidErrorCount:     source.WithGroup(grp, metrics.QueryGPUXIDErrorCount(start, end)),
+		dramActiveAvg:     source.WithGroup(grp, metrics.QueryGPUDRAMActiveAvg(start, end)),
+		dramActiveMax:     source.WithGroup(grp, metrics.QueryGPUDRAMActiveMax(start, end)),
+		smActiveAvg:       source.WithGroup(grp, metrics.QueryGPUSMActiveAvg(start, end)),
+		smOccupancyAvg:    source.WithGroup(grp, metrics.QueryGPUSMOccupancyAvg(start, end)),
+		pcieTxBytesAvg:    source.WithGroup(grp, metrics.QueryGPUPCIeTxBytesAvg(start, end)),
+		pcieRxBytesAvg:    source.WithGroup(grp, metrics.QueryGPUPCIeRxBytesAvg(start, end)),
+		nvlinkTxBytesAvg:  source.WithGroup(grp, metrics.QueryGPUNVLinkTxBytesAvg(start, end)),
+		nvlinkRxBytesAvg:  source.WithGroup(grp, metrics.QueryGPUNVLinkRxBytesAvg(start, end)),
+	}
+}
+
+// awaitAndApply awaits every saturation query and reduces the results onto
+// the device map. Per-future errors are recorded in the query group by
+// Await, matching how the rest of computeDCGMDevices treats its queries.
+func (f *dcgmSaturationFutures) awaitAndApply(deviceMap map[string]*kubemodel.DCGMDevice) {
+	if f == nil {
+		return
+	}
+
+	resThrottleViolation, _ := f.throttleViolation.Await()
+	resThrottleReason, _ := f.throttleReason.Await()
+	resMemoryUsedAvg, _ := f.memoryUsedAvg.Await()
+	resMemoryUsedMax, _ := f.memoryUsedMax.Await()
+	resMemoryPressure, _ := f.memoryPressure.Await()
+	resXIDErrorCount, _ := f.xidErrorCount.Await()
+	resDRAMActiveAvg, _ := f.dramActiveAvg.Await()
+	resDRAMActiveMax, _ := f.dramActiveMax.Await()
+	resSMActiveAvg, _ := f.smActiveAvg.Await()
+	resSMOccupancyAvg, _ := f.smOccupancyAvg.Await()
+	resPCIeTxBytesAvg, _ := f.pcieTxBytesAvg.Await()
+	resPCIeRxBytesAvg, _ := f.pcieRxBytesAvg.Await()
+	resNVLinkTxBytesAvg, _ := f.nvlinkTxBytesAvg.Await()
+	resNVLinkRxBytesAvg, _ := f.nvlinkRxBytesAvg.Await()
+
+	applyDeviceThrottleRatios(deviceMap, resThrottleViolation, func(sat *kubemodel.DCGMDeviceSaturation) map[string]float64 {
+		if sat.ThrottleViolationRatios == nil {
+			sat.ThrottleViolationRatios = make(map[string]float64)
+		}
+		return sat.ThrottleViolationRatios
+	})
+	applyDeviceThrottleRatios(deviceMap, resThrottleReason, func(sat *kubemodel.DCGMDeviceSaturation) map[string]float64 {
+		if sat.ThrottleReasonRatios == nil {
+			sat.ThrottleReasonRatios = make(map[string]float64)
+		}
+		return sat.ThrottleReasonRatios
+	})
+	applyDeviceSaturationScalar(deviceMap, resMemoryUsedAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.MemoryUsedRatioAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resMemoryUsedMax, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.MemoryUsedRatioMax = &v })
+	applyDeviceSaturationScalar(deviceMap, resMemoryPressure, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.MemoryPressureRatio = &v })
+	applyDeviceSaturationScalar(deviceMap, resXIDErrorCount, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.XIDErrorCount = &v })
+	applyDeviceSaturationScalar(deviceMap, resDRAMActiveAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.DRAMActiveAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resDRAMActiveMax, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.DRAMActiveMax = &v })
+	applyDeviceSaturationScalar(deviceMap, resSMActiveAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.SMActiveAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resSMOccupancyAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.SMOccupancyAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resPCIeTxBytesAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.PCIeTxBytesAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resPCIeRxBytesAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.PCIeRxBytesAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resNVLinkTxBytesAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.NVLinkTxBytesAvg = &v })
+	applyDeviceSaturationScalar(deviceMap, resNVLinkRxBytesAvg, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.NVLinkRxBytesAvg = &v })
+}
+
+// ensureDeviceSaturation lazily creates the saturation struct, so devices
+// with no signals keep Saturation nil (absence is never zero).
+func ensureDeviceSaturation(device *kubemodel.DCGMDevice) *kubemodel.DCGMDeviceSaturation {
+	if device.Saturation == nil {
+		device.Saturation = &kubemodel.DCGMDeviceSaturation{}
+	}
+	return device.Saturation
+}
+
+// applyDeviceSaturationScalar reduces container-attributed results to one
+// value per device UUID. Sharing containers carry duplicates of the same
+// device-level value, so last-write-wins is exact; results for UUIDs the
+// info query did not report are skipped.
+func applyDeviceSaturationScalar(deviceMap map[string]*kubemodel.DCGMDevice, results []*source.GPUSaturationResult, set func(sat *kubemodel.DCGMDeviceSaturation, value float64)) {
+	for _, res := range results {
+		device, ok := deviceMap[res.UUID]
+		if !ok || len(res.Data) == 0 {
+			continue
+		}
+		set(ensureDeviceSaturation(device), res.Data[0].Value)
+	}
+}
+
+// applyDeviceThrottleRatios reduces reason-labeled throttle results onto the
+// device's reason map.
+func applyDeviceThrottleRatios(deviceMap map[string]*kubemodel.DCGMDevice, results []*source.GPUSaturationResult, ratios func(sat *kubemodel.DCGMDeviceSaturation) map[string]float64) {
+	for _, res := range results {
+		device, ok := deviceMap[res.UUID]
+		if !ok || len(res.Data) == 0 || res.Reason == "" {
+			continue
+		}
+		ratios(ensureDeviceSaturation(device))[res.Reason] = res.Data[0].Value
+	}
+}
+
+// dcgmDeviceMetricFutures holds the device-level metric queries backing
+// DeviceInfo / DevicePerformance: power, temperature, device-level compute
+// utilization, and framebuffer used. Unlike saturation these are not
+// feature-gated: they are core device telemetry from the default
+// dcgm-exporter configuration.
+type dcgmDeviceMetricFutures struct {
+	powerAvg      *source.QueryGroupFuture[source.GPUDeviceMetricResult]
+	tempAvg       *source.QueryGroupFuture[source.GPUDeviceMetricResult]
+	usageAvg      *source.QueryGroupFuture[source.GPUDeviceMetricResult]
+	usageMax      *source.QueryGroupFuture[source.GPUDeviceMetricResult]
+	memoryUsedAvg *source.QueryGroupFuture[source.GPUDeviceMetricResult]
+	memoryUsedMax *source.QueryGroupFuture[source.GPUDeviceMetricResult]
+}
+
+func startDCGMDeviceMetricQueries(grp *source.QueryGroup, metrics source.MetricsQuerier, start, end time.Time) *dcgmDeviceMetricFutures {
+	return &dcgmDeviceMetricFutures{
+		powerAvg:      source.WithGroup(grp, metrics.QueryGPUDevicePowerAvg(start, end)),
+		tempAvg:       source.WithGroup(grp, metrics.QueryGPUDeviceTempAvg(start, end)),
+		usageAvg:      source.WithGroup(grp, metrics.QueryGPUDeviceUsageAvg(start, end)),
+		usageMax:      source.WithGroup(grp, metrics.QueryGPUDeviceUsageMax(start, end)),
+		memoryUsedAvg: source.WithGroup(grp, metrics.QueryGPUDeviceMemoryUsedAvg(start, end)),
+		memoryUsedMax: source.WithGroup(grp, metrics.QueryGPUDeviceMemoryUsedMax(start, end)),
+	}
+}
+
+const fbMiB = 1024 * 1024
+
+// awaitAndApply reduces the device-level metrics onto the device map,
+// scaling to the model's units: GR_ENGINE_ACTIVE ratio to percent (0-100),
+// FB_USED MiB to bytes.
+func (f *dcgmDeviceMetricFutures) awaitAndApply(deviceMap map[string]*kubemodel.DCGMDevice) {
+	if f == nil {
+		return
+	}
+
+	resPower, _ := f.powerAvg.Await()
+	resTemp, _ := f.tempAvg.Await()
+	resUsageAvg, _ := f.usageAvg.Await()
+	resUsageMax, _ := f.usageMax.Await()
+	resMemAvg, _ := f.memoryUsedAvg.Await()
+	resMemMax, _ := f.memoryUsedMax.Await()
+
+	applyDeviceMetric(deviceMap, resPower, func(d *kubemodel.DCGMDevice, v float64) { d.PowerWatts = v })
+	applyDeviceMetric(deviceMap, resTemp, func(d *kubemodel.DCGMDevice, v float64) { d.TemperatureCelsius = v })
+	applyDeviceMetric(deviceMap, resUsageAvg, func(d *kubemodel.DCGMDevice, v float64) { d.ComputeUtilizationAvg = v * 100 })
+	applyDeviceMetric(deviceMap, resUsageMax, func(d *kubemodel.DCGMDevice, v float64) { d.ComputeUtilizationMax = v * 100 })
+	applyDeviceMetric(deviceMap, resMemAvg, func(d *kubemodel.DCGMDevice, v float64) { d.MemoryUsedBytesAvg = v * fbMiB })
+	applyDeviceMetric(deviceMap, resMemMax, func(d *kubemodel.DCGMDevice, v float64) { d.MemoryUsedBytesMax = v * fbMiB })
+}
+
+// applyDeviceMetric reduces device-keyed results onto device fields;
+// unknown UUIDs and empty results are skipped.
+func applyDeviceMetric(deviceMap map[string]*kubemodel.DCGMDevice, results []*source.GPUDeviceMetricResult, set func(d *kubemodel.DCGMDevice, value float64)) {
+	for _, res := range results {
+		device, ok := deviceMap[res.UUID]
+		if !ok || len(res.Data) == 0 {
+			continue
+		}
+		set(device, res.Data[0].Value)
+	}
+}

+ 222 - 0
pkg/kubemodel/dcgm_saturation_test.go

@@ -0,0 +1,222 @@
+package kubemodel
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/model/kubemodel"
+	"github.com/opencost/opencost/core/pkg/source"
+	"github.com/opencost/opencost/core/pkg/util"
+)
+
+func saturationDeviceMap() map[string]*kubemodel.DCGMDevice {
+	return map[string]*kubemodel.DCGMDevice{
+		"GPU-1": {UUID: "GPU-1", Device: "nvidia0", ModelName: "Tesla T4"},
+		"GPU-2": {UUID: "GPU-2", Device: "nvidia1", ModelName: "Tesla T4"},
+	}
+}
+
+func deviceSaturationResult(uuid, container, reason string, value float64) *source.GPUSaturationResult {
+	return &source.GPUSaturationResult{
+		UUID:      uuid,
+		Container: container,
+		Pod:       "pod-" + container,
+		Namespace: "ns",
+		Reason:    reason,
+		Data:      []*util.Vector{{Value: value}},
+	}
+}
+
+func TestApplyDeviceSaturationScalar(t *testing.T) {
+	deviceMap := saturationDeviceMap()
+
+	results := []*source.GPUSaturationResult{
+		// dcgm-exporter duplicates the device-level value onto every
+		// container sharing the device: both rows carry the same value
+		deviceSaturationResult("GPU-1", "container-a", "", 0.85),
+		deviceSaturationResult("GPU-1", "container-b", "", 0.85),
+		// unknown device must be skipped
+		deviceSaturationResult("GPU-9", "container-c", "", 0.5),
+	}
+	noData := deviceSaturationResult("GPU-2", "container-d", "", 0)
+	noData.Data = nil
+	results = append(results, noData)
+
+	applyDeviceSaturationScalar(deviceMap, results, func(sat *kubemodel.DCGMDeviceSaturation, v float64) { sat.MemoryUsedRatioAvg = &v })
+
+	sat := deviceMap["GPU-1"].Saturation
+	if sat == nil || sat.MemoryUsedRatioAvg == nil || *sat.MemoryUsedRatioAvg != 0.85 {
+		t.Errorf("GPU-1 saturation = %+v, want MemoryUsedRatioAvg 0.85", sat)
+	}
+	// device with only an empty-data result must keep Saturation nil
+	if deviceMap["GPU-2"].Saturation != nil {
+		t.Errorf("GPU-2 saturation should stay nil for empty results, got %+v", deviceMap["GPU-2"].Saturation)
+	}
+}
+
+func TestApplyDeviceThrottleRatios(t *testing.T) {
+	deviceMap := saturationDeviceMap()
+
+	applyDeviceThrottleRatios(deviceMap, []*source.GPUSaturationResult{
+		deviceSaturationResult("GPU-1", "container-a", "power", 0.25),
+		deviceSaturationResult("GPU-1", "container-a", "thermal", 0.1),
+		// duplicate attribution from a sharing container, same value
+		deviceSaturationResult("GPU-1", "container-b", "power", 0.25),
+		// missing reason must be dropped without creating saturation
+		deviceSaturationResult("GPU-2", "container-c", "", 0.5),
+	}, func(sat *kubemodel.DCGMDeviceSaturation) map[string]float64 {
+		if sat.ThrottleViolationRatios == nil {
+			sat.ThrottleViolationRatios = make(map[string]float64)
+		}
+		return sat.ThrottleViolationRatios
+	})
+
+	sat := deviceMap["GPU-1"].Saturation
+	if sat == nil {
+		t.Fatalf("expected saturation on GPU-1")
+	}
+	if len(sat.ThrottleViolationRatios) != 2 || sat.ThrottleViolationRatios["power"] != 0.25 || sat.ThrottleViolationRatios["thermal"] != 0.1 {
+		t.Errorf("ThrottleViolationRatios = %v", sat.ThrottleViolationRatios)
+	}
+	if deviceMap["GPU-2"].Saturation != nil {
+		t.Errorf("reasonless result must not create saturation, got %+v", deviceMap["GPU-2"].Saturation)
+	}
+}
+
+// TestDCGMSaturationAwaitAndApply runs the full reduction over a fabricated
+// future bundle, verifying every signal lands on the right device and that
+// devices without signals keep Saturation nil.
+func TestDCGMSaturationAwaitAndApply(t *testing.T) {
+	makeFuture := func(results ...*source.GPUSaturationResult) *source.QueryGroupFuture[source.GPUSaturationResult] {
+		queryResults := source.NewQueryResults("test")
+		for _, res := range results {
+			metrics := map[string]any{
+				"UUID":      res.UUID,
+				"container": res.Container,
+				"pod":       res.Pod,
+				"namespace": res.Namespace,
+			}
+			if res.Reason != "" {
+				metrics["reason"] = res.Reason
+			}
+			queryResults.Results = append(queryResults.Results, source.NewQueryResult(metrics, res.Data, nil))
+		}
+		ch := make(source.QueryResultsChan, 1)
+		ch <- queryResults
+		grp := source.NewQueryGroup()
+		return source.WithGroup(grp, source.NewFuture(source.DecodeGPUSaturationResult, ch))
+	}
+
+	deviceMap := saturationDeviceMap()
+	futures := &dcgmSaturationFutures{
+		throttleViolation: makeFuture(deviceSaturationResult("GPU-1", "c", "power", 0.25)),
+		throttleReason:    makeFuture(deviceSaturationResult("GPU-1", "c", "sw_power_cap", 0.2)),
+		memoryUsedAvg:     makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.85)),
+		memoryUsedMax:     makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.99)),
+		memoryPressure:    makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.4)),
+		xidErrorCount:     makeFuture(deviceSaturationResult("GPU-1", "c", "", 2)),
+		dramActiveAvg:     makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.7)),
+		dramActiveMax:     makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.95)),
+		smActiveAvg:       makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.6)),
+		smOccupancyAvg:    makeFuture(deviceSaturationResult("GPU-1", "c", "", 0.5)),
+		pcieTxBytesAvg:    makeFuture(deviceSaturationResult("GPU-1", "c", "", 1.5e9)),
+		pcieRxBytesAvg:    makeFuture(deviceSaturationResult("GPU-1", "c", "", 2.5e9)),
+		nvlinkTxBytesAvg:  makeFuture(),
+		nvlinkRxBytesAvg:  makeFuture(),
+	}
+
+	futures.awaitAndApply(deviceMap)
+
+	sat := deviceMap["GPU-1"].Saturation
+	if sat == nil {
+		t.Fatalf("expected saturation on GPU-1")
+	}
+	if err := sat.Validate(); err != nil {
+		t.Fatalf("hydrated saturation invalid: %v", err)
+	}
+	if sat.ThrottleViolationRatios["power"] != 0.25 || sat.ThrottleReasonRatios["sw_power_cap"] != 0.2 {
+		t.Errorf("throttle ratios = %v / %v", sat.ThrottleViolationRatios, sat.ThrottleReasonRatios)
+	}
+	scalarChecks := map[string]*float64{
+		"MemoryUsedRatioAvg=0.85": sat.MemoryUsedRatioAvg,
+		"MemoryUsedRatioMax=0.99": sat.MemoryUsedRatioMax,
+		"MemoryPressureRatio=0.4": sat.MemoryPressureRatio,
+		"XIDErrorCount=2":         sat.XIDErrorCount,
+		"DRAMActiveAvg=0.7":       sat.DRAMActiveAvg,
+		"DRAMActiveMax=0.95":      sat.DRAMActiveMax,
+		"SMActiveAvg=0.6":         sat.SMActiveAvg,
+		"SMOccupancyAvg=0.5":      sat.SMOccupancyAvg,
+		"PCIeTxBytesAvg=1.5e9":    sat.PCIeTxBytesAvg,
+		"PCIeRxBytesAvg=2.5e9":    sat.PCIeRxBytesAvg,
+	}
+	for name, ptr := range scalarChecks {
+		if ptr == nil {
+			t.Errorf("%s: signal missing", name)
+		}
+	}
+	// NVLink queries returned nothing: absent, not zero
+	if sat.NVLinkTxBytesAvg != nil || sat.NVLinkRxBytesAvg != nil {
+		t.Errorf("expected NVLink signals to stay nil, got %v / %v", sat.NVLinkTxBytesAvg, sat.NVLinkRxBytesAvg)
+	}
+	// no signals targeted GPU-2: Saturation stays nil
+	if deviceMap["GPU-2"].Saturation != nil {
+		t.Errorf("GPU-2 saturation should be nil, got %+v", deviceMap["GPU-2"].Saturation)
+	}
+
+	// nil bundle (feature disabled) must be a no-op
+	var disabled *dcgmSaturationFutures
+	disabled.awaitAndApply(deviceMap)
+}
+
+// TestDCGMDeviceMetricAwaitAndApply verifies device-level metrics land with
+// correct unit scaling and that devices without series keep zero values.
+func TestDCGMDeviceMetricAwaitAndApply(t *testing.T) {
+	makeFuture := func(results ...*source.GPUDeviceMetricResult) *source.QueryGroupFuture[source.GPUDeviceMetricResult] {
+		queryResults := source.NewQueryResults("test")
+		for _, res := range results {
+			queryResults.Results = append(queryResults.Results, source.NewQueryResult(map[string]any{
+				"UUID": res.UUID,
+			}, res.Data, nil))
+		}
+		ch := make(source.QueryResultsChan, 1)
+		ch <- queryResults
+		grp := source.NewQueryGroup()
+		return source.WithGroup(grp, source.NewFuture(source.DecodeGPUDeviceMetricResult, ch))
+	}
+	metricResult := func(uuid string, value float64) *source.GPUDeviceMetricResult {
+		return &source.GPUDeviceMetricResult{UUID: uuid, Data: []*util.Vector{{Value: value}}}
+	}
+
+	deviceMap := saturationDeviceMap()
+	futures := &dcgmDeviceMetricFutures{
+		powerAvg:      makeFuture(metricResult("GPU-1", 140)),
+		tempAvg:       makeFuture(metricResult("GPU-1", 55)),
+		usageAvg:      makeFuture(metricResult("GPU-1", 0.425)),
+		usageMax:      makeFuture(metricResult("GPU-1", 0.97)),
+		memoryUsedAvg: makeFuture(metricResult("GPU-1", 1024)), // MiB
+		memoryUsedMax: makeFuture(metricResult("GPU-1", 2048)),
+	}
+
+	futures.awaitAndApply(deviceMap)
+
+	d := deviceMap["GPU-1"]
+	if d.PowerWatts != 140 || d.TemperatureCelsius != 55 {
+		t.Errorf("power/temp = (%v, %v), want (140, 55)", d.PowerWatts, d.TemperatureCelsius)
+	}
+	// GR_ENGINE_ACTIVE ratio scaled to percent
+	if d.ComputeUtilizationAvg != 42.5 || d.ComputeUtilizationMax != 97 {
+		t.Errorf("compute util = (%v, %v), want (42.5, 97)", d.ComputeUtilizationAvg, d.ComputeUtilizationMax)
+	}
+	// FB_USED MiB scaled to bytes
+	if d.MemoryUsedBytesAvg != 1024*1024*1024 || d.MemoryUsedBytesMax != 2048*1024*1024 {
+		t.Errorf("memory bytes = (%v, %v)", d.MemoryUsedBytesAvg, d.MemoryUsedBytesMax)
+	}
+
+	// no series targeted GPU-2: untouched zeros
+	if deviceMap["GPU-2"].PowerWatts != 0 || deviceMap["GPU-2"].ComputeUtilizationAvg != 0 {
+		t.Errorf("GPU-2 must stay zero: %+v", deviceMap["GPU-2"])
+	}
+
+	// nil bundle is a no-op
+	var disabled *dcgmDeviceMetricFutures
+	disabled.awaitAndApply(deviceMap)
+}

+ 15 - 0
pkg/kubemodel/kubemodel.go

@@ -9,6 +9,7 @@ import (
 	"github.com/opencost/opencost/core/pkg/model/kubemodel"
 	"github.com/opencost/opencost/core/pkg/model/kubemodel"
 	"github.com/opencost/opencost/core/pkg/model/shared"
 	"github.com/opencost/opencost/core/pkg/model/shared"
 	"github.com/opencost/opencost/core/pkg/source"
 	"github.com/opencost/opencost/core/pkg/source"
+	"github.com/opencost/opencost/pkg/env"
 )
 )
 
 
 const logTimeFmt string = "2006-01-02T15:04:05"
 const logTimeFmt string = "2006-01-02T15:04:05"
@@ -1517,6 +1518,13 @@ func (km *KubeModel) computeDCGMDevices(kms *kubemodel.KubeModelSet, start, end
 	dcgmUsageAvgFuture := source.WithGroup(grp, metrics.QueryDCGMContainerUsageAvg(start, end))
 	dcgmUsageAvgFuture := source.WithGroup(grp, metrics.QueryDCGMContainerUsageAvg(start, end))
 	dcgmUsageMaxFuture := source.WithGroup(grp, metrics.QueryDCGMContainerUsageMax(start, end))
 	dcgmUsageMaxFuture := source.WithGroup(grp, metrics.QueryDCGMContainerUsageMax(start, end))
 
 
+	var saturationFutures *dcgmSaturationFutures
+	if env.IsGPUSaturationMetricsEnabled() {
+		saturationFutures = startDCGMSaturationQueries(grp, metrics, start, end)
+	}
+
+	deviceMetricFutures := startDCGMDeviceMetricQueries(grp, metrics, start, end)
+
 	deviceMap := make(map[string]*kubemodel.DCGMDevice)
 	deviceMap := make(map[string]*kubemodel.DCGMDevice)
 
 
 	dcgmInfoResult, _ := dcgmInfoFuture.Await()
 	dcgmInfoResult, _ := dcgmInfoFuture.Await()
@@ -1579,6 +1587,13 @@ func (km *KubeModel) computeDCGMDevices(kms *kubemodel.KubeModelSet, start, end
 		device.PodUsages[res.PodUID] = pod
 		device.PodUsages[res.PodUID] = pod
 	}
 	}
 
 
+	// reduce container-attributed saturation series onto each device; nil
+	// when the feature is disabled
+	saturationFutures.awaitAndApply(deviceMap)
+
+	// device-level power/temperature/utilization/memory
+	deviceMetricFutures.awaitAndApply(deviceMap)
+
 	for _, device := range deviceMap {
 	for _, device := range deviceMap {
 		if err := kms.RegisterDCGMDevice(device); err != nil {
 		if err := kms.RegisterDCGMDevice(device); err != nil {
 			log.Warnf("Failed to register DCGM device: %s", err.Error())
 			log.Warnf("Failed to register DCGM device: %s", err.Error())