|
|
@@ -100,9 +100,64 @@ type Allocation struct {
|
|
|
// UnmountedPVCost is used to track how much of the cost in PVs is for an
|
|
|
// unmounted PV. It is not additive of PVCost() and need not be sent in API
|
|
|
// responses.
|
|
|
- UnmountedPVCost float64 `json:"-"` //@bingen:field[ignore]
|
|
|
- GPURequestAverage float64 `json:"gpuRequestAverage"` //@bingen:field[version=22]
|
|
|
- GPUUsageAverage float64 `json:"gpuUsageAverage"` //@bingen:field[version=22]
|
|
|
+ UnmountedPVCost float64 `json:"-"` //@bingen:field[ignore]
|
|
|
+ deprecatedGPURequestAverage float64 `json:"-"` //@bingen:field[version=22]
|
|
|
+ deprecatedGPUUsageAverage float64 `json:"-"` //@bingen:field[version=22]
|
|
|
+ GPUAllocation *GPUAllocation `json:"GPUAllocation"` //@bingen:field[version=23]
|
|
|
+}
|
|
|
+
|
|
|
+type GPUAllocation struct {
|
|
|
+ GPUDevice string `json:"gpuDevice,omitempty"`
|
|
|
+ GPUModel string `json:"gpuModel,omitempty"`
|
|
|
+ GPUUUID string `json:"gpuUUID,omitempty"`
|
|
|
+
|
|
|
+ IsGPUShared *bool `json:"isGPUShared"`
|
|
|
+ GPUUsageAverage *float64 `json:"gpuUsageAverage"`
|
|
|
+ GPURequestAverage *float64 `json:"gpuRequestAverage"`
|
|
|
+}
|
|
|
+
|
|
|
+func (orig *GPUAllocation) SanitizeNaN() {
|
|
|
+ if orig == nil {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if orig.GPURequestAverage == nil || math.IsNaN(*orig.GPURequestAverage) {
|
|
|
+ orig.GPURequestAverage = nil
|
|
|
+ }
|
|
|
+ if orig.GPUUsageAverage == nil || math.IsNaN(*orig.GPUUsageAverage) {
|
|
|
+ orig.GPUUsageAverage = nil
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func (orig *GPUAllocation) Clone() *GPUAllocation {
|
|
|
+ if orig == nil {
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+
|
|
|
+ return &GPUAllocation{
|
|
|
+ GPUDevice: orig.GPUDevice,
|
|
|
+ GPUModel: orig.GPUModel,
|
|
|
+ GPUUUID: orig.GPUUUID,
|
|
|
+ IsGPUShared: orig.IsGPUShared,
|
|
|
+ GPUUsageAverage: orig.GPUUsageAverage,
|
|
|
+ GPURequestAverage: orig.GPURequestAverage,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func (orig *GPUAllocation) Equal(that *GPUAllocation) bool {
|
|
|
+ if orig == nil && that == nil {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ if orig == nil || that == nil {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+
|
|
|
+ return orig.GPUDevice == that.GPUDevice &&
|
|
|
+ orig.GPUModel == that.GPUModel &&
|
|
|
+ orig.GPUUUID == that.GPUUUID &&
|
|
|
+ orig.IsGPUShared == that.IsGPUShared &&
|
|
|
+ orig.GPUUsageAverage == that.GPUUsageAverage &&
|
|
|
+ orig.GPURequestAverage == that.GPURequestAverage
|
|
|
+
|
|
|
}
|
|
|
|
|
|
type LbAllocations map[string]*LbAllocation
|
|
|
@@ -174,8 +229,9 @@ func (lba *LbAllocation) SanitizeNaN() {
|
|
|
// then this type would be unnecessary and its fields would go into the regular Allocation
|
|
|
// and not in the AggregatedAllocation.
|
|
|
type RawAllocationOnlyData struct {
|
|
|
- CPUCoreUsageMax float64 `json:"cpuCoreUsageMax"`
|
|
|
- RAMBytesUsageMax float64 `json:"ramByteUsageMax"`
|
|
|
+ CPUCoreUsageMax float64 `json:"cpuCoreUsageMax"`
|
|
|
+ RAMBytesUsageMax float64 `json:"ramByteUsageMax"`
|
|
|
+ GPUUsageMax *float64 `json:"gpuUsageMax"` //@bingen:field[version=23]
|
|
|
}
|
|
|
|
|
|
// Clone returns a deep copy of the given RawAllocationOnlyData
|
|
|
@@ -187,6 +243,7 @@ func (r *RawAllocationOnlyData) Clone() *RawAllocationOnlyData {
|
|
|
return &RawAllocationOnlyData{
|
|
|
CPUCoreUsageMax: r.CPUCoreUsageMax,
|
|
|
RAMBytesUsageMax: r.RAMBytesUsageMax,
|
|
|
+ GPUUsageMax: r.GPUUsageMax,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -198,8 +255,16 @@ func (r *RawAllocationOnlyData) Equal(that *RawAllocationOnlyData) bool {
|
|
|
if r == nil || that == nil {
|
|
|
return false
|
|
|
}
|
|
|
- return util.IsApproximately(r.CPUCoreUsageMax, that.CPUCoreUsageMax) &&
|
|
|
+ cmpResult := util.IsApproximately(r.CPUCoreUsageMax, that.CPUCoreUsageMax) &&
|
|
|
util.IsApproximately(r.RAMBytesUsageMax, that.RAMBytesUsageMax)
|
|
|
+
|
|
|
+ if r.GPUUsageMax != nil && that.GPUUsageMax != nil {
|
|
|
+ cmpResult = cmpResult && util.IsApproximately(*r.GPUUsageMax, *that.GPUUsageMax)
|
|
|
+ } else if !(r.GPUUsageMax == nil && that.GPUUsageMax == nil) {
|
|
|
+ cmpResult = false
|
|
|
+ }
|
|
|
+
|
|
|
+ return cmpResult
|
|
|
}
|
|
|
|
|
|
func (r *RawAllocationOnlyData) SanitizeNaN() {
|
|
|
@@ -214,6 +279,10 @@ func (r *RawAllocationOnlyData) SanitizeNaN() {
|
|
|
log.DedupedWarningf(5, "RawAllocationOnlyData: Unexpected NaN found for RAMBytesUsageMax")
|
|
|
r.RAMBytesUsageMax = 0
|
|
|
}
|
|
|
+ if r.GPUUsageMax == nil || math.IsNaN(*r.GPUUsageMax) {
|
|
|
+ log.DedupedWarningf(5, "RawAllocationOnlyData: Unexpected NaN found for GPUUsageMax")
|
|
|
+ r.GPUUsageMax = nil
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
// PVAllocations is a map of Disk Asset Identifiers to the
|
|
|
@@ -675,8 +744,8 @@ func (a *Allocation) Clone() *Allocation {
|
|
|
CPUCostIdle: a.CPUCostIdle,
|
|
|
CPUCostAdjustment: a.CPUCostAdjustment,
|
|
|
GPUHours: a.GPUHours,
|
|
|
- GPURequestAverage: a.GPURequestAverage,
|
|
|
- GPUUsageAverage: a.GPUUsageAverage,
|
|
|
+ deprecatedGPURequestAverage: a.deprecatedGPURequestAverage,
|
|
|
+ deprecatedGPUUsageAverage: a.deprecatedGPUUsageAverage,
|
|
|
GPUCost: a.GPUCost,
|
|
|
GPUCostIdle: a.GPUCostIdle,
|
|
|
GPUCostAdjustment: a.GPUCostAdjustment,
|
|
|
@@ -704,6 +773,7 @@ func (a *Allocation) Clone() *Allocation {
|
|
|
SharedCostBreakdown: a.SharedCostBreakdown.Clone(),
|
|
|
LoadBalancers: a.LoadBalancers.Clone(),
|
|
|
UnmountedPVCost: a.UnmountedPVCost,
|
|
|
+ GPUAllocation: a.GPUAllocation.Clone(),
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -816,6 +886,10 @@ func (a *Allocation) Equal(that *Allocation) bool {
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
+ if !a.GPUAllocation.Equal(that.GPUAllocation) {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+
|
|
|
return true
|
|
|
}
|
|
|
|
|
|
@@ -963,17 +1037,24 @@ func (a *Allocation) RAMEfficiency() float64 {
|
|
|
|
|
|
// GPUEfficiency is the ratio of usage to request. Note that, without the NVIDIA
|
|
|
// DCGM exporter providing Prometheus with usage metrics, this will always be
|
|
|
-// zero, as GPUUsageAverage will be zero (the default value).
|
|
|
+// zero, as deprecatedGPUUsageAverage will be zero (the default value).
|
|
|
func (a *Allocation) GPUEfficiency() float64 {
|
|
|
if a == nil {
|
|
|
return 0.0
|
|
|
}
|
|
|
+ if a.GPUAllocation == nil {
|
|
|
+ return 0.0
|
|
|
+ }
|
|
|
+
|
|
|
+ if a.GPUAllocation.GPURequestAverage == nil || a.GPUAllocation.GPUUsageAverage == nil {
|
|
|
+ return 0.0
|
|
|
+ }
|
|
|
|
|
|
- if a.GPURequestAverage > 0 && a.GPUUsageAverage > 0 {
|
|
|
- return a.GPUUsageAverage / a.GPURequestAverage
|
|
|
+ if *a.GPUAllocation.GPURequestAverage > 0 && *a.GPUAllocation.GPUUsageAverage > 0 {
|
|
|
+ return *a.GPUAllocation.GPUUsageAverage / *a.GPUAllocation.GPURequestAverage
|
|
|
}
|
|
|
|
|
|
- if a.GPUUsageAverage == 0.0 || a.GPUTotalCost() == 0.0 {
|
|
|
+ if *a.GPUAllocation.GPURequestAverage == 0.0 || a.GPUTotalCost() == 0.0 {
|
|
|
return 0.0
|
|
|
}
|
|
|
|
|
|
@@ -1221,11 +1302,37 @@ func (a *Allocation) add(that *Allocation) {
|
|
|
ramUseByteMins := a.RAMBytesUsageAverage * a.Minutes()
|
|
|
ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
|
|
|
|
|
|
- gpuReqMins := a.GPURequestAverage * a.Minutes()
|
|
|
- gpuReqMins += that.GPURequestAverage * that.Minutes()
|
|
|
+ var gpuReqMins *float64 = nil
|
|
|
+ if a.GPUAllocation != nil && a.GPUAllocation.GPURequestAverage != nil {
|
|
|
+ result := *a.GPUAllocation.GPURequestAverage * a.Minutes()
|
|
|
+ gpuReqMins = &result
|
|
|
+ }
|
|
|
|
|
|
- gpuUseMins := a.GPUUsageAverage * a.Minutes()
|
|
|
- gpuUseMins += that.GPUUsageAverage * that.Minutes()
|
|
|
+ if that.GPUAllocation != nil && that.GPUAllocation.GPURequestAverage != nil {
|
|
|
+ if gpuReqMins == nil {
|
|
|
+ result := *that.GPUAllocation.GPURequestAverage * that.Minutes()
|
|
|
+ gpuReqMins = &result
|
|
|
+ } else {
|
|
|
+ result := *gpuReqMins + *that.GPUAllocation.GPURequestAverage*that.Minutes()
|
|
|
+ gpuReqMins = &result
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ var gpuUseMins *float64 = nil
|
|
|
+ if a.GPUAllocation != nil && a.GPUAllocation.GPUUsageAverage != nil {
|
|
|
+ result := *a.GPUAllocation.GPUUsageAverage * a.Minutes()
|
|
|
+ gpuUseMins = &result
|
|
|
+ }
|
|
|
+
|
|
|
+ if that.GPUAllocation != nil && that.GPUAllocation.GPUUsageAverage != nil {
|
|
|
+ if gpuUseMins == nil {
|
|
|
+ result := *that.GPUAllocation.GPUUsageAverage * that.Minutes()
|
|
|
+ gpuUseMins = &result
|
|
|
+ } else {
|
|
|
+ result := *gpuUseMins + *that.GPUAllocation.GPUUsageAverage*that.Minutes()
|
|
|
+ gpuUseMins = &result
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
// Expand Start and End to be the "max" of among the given Allocations
|
|
|
if that.Start.Before(a.Start) {
|
|
|
@@ -1242,15 +1349,32 @@ func (a *Allocation) add(that *Allocation) {
|
|
|
a.CPUCoreUsageAverage = cpuUseCoreMins / a.Minutes()
|
|
|
a.RAMBytesRequestAverage = ramReqByteMins / a.Minutes()
|
|
|
a.RAMBytesUsageAverage = ramUseByteMins / a.Minutes()
|
|
|
- a.GPURequestAverage = gpuReqMins / a.Minutes()
|
|
|
- a.GPUUsageAverage = gpuUseMins / a.Minutes()
|
|
|
+
|
|
|
+ if a.GPUAllocation != nil {
|
|
|
+ if gpuReqMins != nil {
|
|
|
+ gpuReqMinsRes := *gpuReqMins / a.Minutes()
|
|
|
+ a.GPUAllocation.GPURequestAverage = &gpuReqMinsRes
|
|
|
+ } else {
|
|
|
+ a.GPUAllocation.GPURequestAverage = nil
|
|
|
+ }
|
|
|
+
|
|
|
+ if gpuUseMins != nil {
|
|
|
+ gpuUsageMinsRes := *gpuUseMins / a.Minutes()
|
|
|
+ a.GPUAllocation.GPUUsageAverage = &gpuUsageMinsRes
|
|
|
+ } else {
|
|
|
+ a.GPUAllocation.GPUUsageAverage = nil
|
|
|
+ }
|
|
|
+ }
|
|
|
} else {
|
|
|
a.CPUCoreRequestAverage = 0.0
|
|
|
a.CPUCoreUsageAverage = 0.0
|
|
|
a.RAMBytesRequestAverage = 0.0
|
|
|
a.RAMBytesUsageAverage = 0.0
|
|
|
- a.GPURequestAverage = 0.0
|
|
|
- a.GPUUsageAverage = 0.0
|
|
|
+
|
|
|
+ if a.GPUAllocation != nil {
|
|
|
+ a.GPUAllocation.GPURequestAverage = nil
|
|
|
+ a.GPUAllocation.GPUUsageAverage = nil
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
// Sum all cumulative resource fields
|
|
|
@@ -1290,6 +1414,21 @@ func (a *Allocation) add(that *Allocation) {
|
|
|
// Sum LoadBalancer Allocations
|
|
|
a.LoadBalancers = a.LoadBalancers.Add(that.LoadBalancers)
|
|
|
|
|
|
+ // Sum GPU Allocations
|
|
|
+ if that.GPUAllocation != nil {
|
|
|
+ if a.GPUAllocation == nil {
|
|
|
+ a.GPUAllocation = that.GPUAllocation.Clone()
|
|
|
+ } else {
|
|
|
+ if a.GPUAllocation.GPUUsageAverage != nil && that.GPUAllocation.GPUUsageAverage != nil {
|
|
|
+ *a.GPUAllocation.GPUUsageAverage += *that.GPUAllocation.GPUUsageAverage
|
|
|
+ }
|
|
|
+
|
|
|
+ if a.GPUAllocation.GPURequestAverage != nil && that.GPUAllocation.GPURequestAverage != nil {
|
|
|
+ *a.GPUAllocation.GPURequestAverage += *that.GPUAllocation.GPURequestAverage
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
// Any data that is in a "raw allocation only" is not valid in any
|
|
|
// sort of cumulative Allocation (like one that is added).
|
|
|
a.RawAllocationOnly = nil
|
|
|
@@ -2587,14 +2726,7 @@ func (a *Allocation) SanitizeNaN() {
|
|
|
log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPUHours name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
|
|
|
a.GPUHours = 0
|
|
|
}
|
|
|
- if math.IsNaN(a.GPURequestAverage) {
|
|
|
- log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPURequestAverage name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
|
|
|
- a.GPURequestAverage = 0
|
|
|
- }
|
|
|
- if math.IsNaN(a.GPUUsageAverage) {
|
|
|
- log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPUUsageAverage name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
|
|
|
- a.GPUUsageAverage = 0
|
|
|
- }
|
|
|
+
|
|
|
if math.IsNaN(a.GPUCost) {
|
|
|
log.DedupedWarningf(5, "Allocation: Unexpected NaN found for GPUCost name:%s, window:%s, properties:%s", a.Name, a.Window.String(), a.Properties.String())
|
|
|
a.GPUCost = 0
|
|
|
@@ -2682,6 +2814,7 @@ func (a *Allocation) SanitizeNaN() {
|
|
|
|
|
|
a.PVs.SanitizeNaN()
|
|
|
a.RawAllocationOnly.SanitizeNaN()
|
|
|
+ a.GPUAllocation.SanitizeNaN()
|
|
|
a.ProportionalAssetResourceCosts.SanitizeNaN()
|
|
|
a.SharedCostBreakdown.SanitizeNaN()
|
|
|
a.LoadBalancers.SanitizeNaN()
|
|
|
@@ -3570,3 +3703,15 @@ func (asr *AllocationSetRange) Clone() *AllocationSetRange {
|
|
|
|
|
|
return sasrClone
|
|
|
}
|
|
|
+
|
|
|
+func migrateAllocation(as *Allocation, fromVersion uint8, toVersion uint8) {
|
|
|
+ if fromVersion == toVersion {
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ if fromVersion == 22 && toVersion >= 23 {
|
|
|
+ as.GPUAllocation = &GPUAllocation{}
|
|
|
+ as.GPUAllocation.GPUUsageAverage = &as.deprecatedGPUUsageAverage
|
|
|
+ as.GPUAllocation.GPURequestAverage = &as.deprecatedGPURequestAverage
|
|
|
+ }
|
|
|
+}
|