akillibulut
/
cost-model
mirror da https://github.com/kubecost/cost-model


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
							package kubemodel

import "time"

// Accelerator devices are modeled as per-vendor concrete types (e.g.
// DCGMDevice for NVIDIA) behind small interfaces rather than one generic
// concrete struct. A generic type forces every vendor's changes into shared
// code, implies relational fields not all vendors can provide, and blurs
// semantics that differ between vendors (e.g. what "power" means for AMD vs
// NVIDIA). Overly specific shared models have the opposite failure:
// collecting more than is used and looking half-supported for vendors
// missing a feature. Vendor structs keep agent-side transformation minimal
// (data ships close to its source shape, e.g. DCGM for NVIDIA), while these
// interfaces provide the common surface for identification, utilization,
// performance, and saturation.
//
// Capacity is intentionally absent: it is not readily available from vendor
// telemetry (e.g. DCGM) and is inferred via device plugins / DRA, which are
// assumed prerequisites and provide the request half of device allocation.
// Fan speed and device topology (including PCIe topology detail) are out of
// scope.

// DeviceInfo describes which accelerator device, physical or virtualized,
// is being reported on, together with its lifecycle inside the observed
// window.
type DeviceInfo interface {
	// GetIdentifier reports the UUID of the device. A virtualized device
	// (e.g. a MIG instance) reports the UUID of the virtual device itself.
	GetIdentifier() string

	// GetParent reports the UUID of the parent device, or the empty string
	// when there is none. A MIG instance reports the UUID of its physical
	// GPU.
	GetParent() string

	// GetType reports the device type in human-readable form, e.g. "GPU".
	GetType() string

	// GetName reports the marketing/model name, e.g. "NVIDIA A100 80GB".
	GetName() string

	// GetStart reports when the device came online within the observed
	// window.
	GetStart() time.Time

	// GetEnd reports when observation of the device ended.
	GetEnd() time.Time

	// GetPower reports the power drawn by the device, in watts.
	GetPower() (watts float64)
}

// DevicePerformance reports the utilization and performance measures of an
// accelerator device.
type DevicePerformance interface {
	// GetComputeUtilizationAverage reports the mean percentage of compute
	// in use across the observed window.
	GetComputeUtilizationAverage() (percent float64)

	// GetComputeUtilizationMax reports the highest percentage of compute
	// in use.
	GetComputeUtilizationMax() (percent float64)

	// GetMemoryUtilizationAverage reports the mean memory in use as an
	// absolute amount, in bytes.
	GetMemoryUtilizationAverage() (usedBytes float64)

	// GetMemoryUtilizationMax reports the highest memory in use as an
	// absolute amount, in bytes.
	GetMemoryUtilizationMax() (usedBytes float64)

	// GetTemp reports the current temperature of the device, in degrees
	// Celsius.
	GetTemp() (celsius float64)
}

// DeviceSaturation reports USE-method saturation signals for an accelerator
// device. DevicePerformance describes how busy the device was; saturation
// describes work that was queued, rejected, or slowed because the device
// could not service demand.
//
// A missing signal is not a zero signal. When the vendor telemetry behind a
// getter was unavailable, scalar getters return ok=false and map getters
// return nil, so "not measured" is never mistaken for "not saturated".
// Method names are vendor-neutral; each implementation owns the mapping to
// its vendor telemetry (e.g. DCGM fields for NVIDIA). Ratios are fractions
// of the observed window in [0, 1].
type DeviceSaturation interface {
	// GetThrottleViolationRatios reports, per vendor-defined reason name,
	// the fraction of the window the device spent throttled. Values are
	// derived from cumulative time-in-violation counters. The map is nil
	// when the signal is unavailable.
	GetThrottleViolationRatios() map[string]float64

	// GetThrottleReasonRatios reports, per vendor-defined throttle reason,
	// the fraction of the window that reason's state was active. Values
	// are derived from sampled state flags. The map is nil when the signal
	// is unavailable.
	GetThrottleReasonRatios() map[string]float64

	// GetComputeActiveAvg reports the average ratio of cycles in which
	// compute units had work resident. Combined with
	// GetComputeOccupancyAvg and GetMemoryBandwidthActiveAvg it lets
	// consumers tell compute-bound saturation apart from bandwidth- or
	// latency-bound saturation.
	GetComputeActiveAvg() (ratio float64, ok bool)

	// GetComputeOccupancyAvg reports the average ratio of resident work to
	// the maximum the compute units support.
	GetComputeOccupancyAvg() (ratio float64, ok bool)

	// GetMemoryBandwidthActiveAvg reports the average ratio of cycles in
	// which the device memory interface was active.
	GetMemoryBandwidthActiveAvg() (ratio float64, ok bool)

	// GetMemoryBandwidthActiveMax reports the peak ratio of cycles in
	// which the device memory interface was active.
	GetMemoryBandwidthActiveMax() (ratio float64, ok bool)

	// GetMemoryUsedRatioAvg reports average device-memory occupancy.
	GetMemoryUsedRatioAvg() (ratio float64, ok bool)

	// GetMemoryUsedRatioMax reports peak device-memory occupancy.
	GetMemoryUsedRatioMax() (ratio float64, ok bool)

	// GetMemoryPressureRatio reports the fraction of the window in which
	// device-memory occupancy was above the configured threshold.
	GetMemoryPressureRatio() (ratio float64, ok bool)

	// GetErrorEventCount reports how many device error events (rejected
	// work) were observed in the window.
	GetErrorEventCount() (count float64, ok bool)

	// GetHostLinkTxBytesAvg reports average transmit throughput on the
	// host interconnect (e.g. PCIe) in bytes/sec. Link capacity is not
	// derivable from vendor telemetry, so this is a rate, not a ratio.
	GetHostLinkTxBytesAvg() (bytesPerSec float64, ok bool)

	// GetHostLinkRxBytesAvg reports average receive throughput on the host
	// interconnect (e.g. PCIe) in bytes/sec. Link capacity is not
	// derivable from vendor telemetry, so this is a rate, not a ratio.
	GetHostLinkRxBytesAvg() (bytesPerSec float64, ok bool)

	// GetPeerLinkTxBytesAvg reports average transmit throughput on the
	// device-to-device fabric (e.g. NVLink) in bytes/sec.
	GetPeerLinkTxBytesAvg() (bytesPerSec float64, ok bool)

	// GetPeerLinkRxBytesAvg reports average receive throughput on the
	// device-to-device fabric (e.g. NVLink) in bytes/sec.
	GetPeerLinkRxBytesAvg() (bytesPerSec float64, ok bool)
}