Kaynağa Gözat

Merge branch 'develop' into cloudclost-k8sless

Matt Ray 2 yıl önce
ebeveyn
işleme
13cb87c20c

+ 8 - 0
core/pkg/log/log.go

@@ -126,6 +126,14 @@ func Debugf(format string, a ...interface{}) {
 	log.Debug().Msgf(format, a...)
 }
 
+func Trace(msg string) {
+	log.Trace().Msg(msg)
+}
+
+func Tracef(format string, a ...interface{}) {
+	log.Trace().Msgf(format, a...)
+}
+
 func Fatalf(format string, a ...interface{}) {
 	log.Fatal().Msgf(format, a...)
 }

+ 1 - 0
pkg/cloud/models/models.go

@@ -56,6 +56,7 @@ type Node struct {
 	GPU              string                `json:"gpu"` // GPU represents the number of GPU on the instance
 	GPUName          string                `json:"gpuName"`
 	GPUCost          string                `json:"gpuCost"`
+	VGPU             string                `json:"vgpu"` // virtualized GPU count — set when GPU replicas (time-slicing) are in use
 	InstanceType     string                `json:"instanceType,omitempty"`
 	Region           string                `json:"region,omitempty"`
 	Reserved         *ReservedInstanceData `json:"reserved,omitempty"`

+ 19 - 15
pkg/costmodel/costmodel.go

@@ -317,17 +317,6 @@ func (cm *CostModel) ComputeCostData(cli prometheusClient.Client, cp costAnalyze
 		log.Warnf("ComputeCostData: continuing despite error parsing normalization values from %s: %s", queryNormalization, err.Error())
 	}
 
-	// Determine if there are vgpus configured and if so get the total allocatable number
-	// If there are no vgpus, the coefficient is set to 1.0
-	vgpuCount, err := getAllocatableVGPUs(cm.Cache)
-	if err != nil {
-		log.Warnf("getAllocatableVGCPUs error: %s", err.Error())
-	}
-	vgpuCoeff := 10.0
-	if vgpuCount > 0.0 {
-		vgpuCoeff = vgpuCount
-	}
-
 	nodes, err := cm.GetNodeCost(cp)
 	if err != nil {
 		log.Warnf("GetNodeCost: no node cost model available: " + err.Error())
@@ -515,10 +504,9 @@ func (cm *CostModel) ComputeCostData(cli prometheusClient.Client, cp costAnalyze
 				} else if g, ok := container.Resources.Limits["nvidia.com/gpu"]; ok {
 					gpuReqCount = g.AsApproximateFloat64()
 				} else if g, ok := container.Resources.Requests["k8s.amazonaws.com/vgpu"]; ok {
-					// divide vgpu request/limits by total vgpus to get the portion of physical gpus requested
-					gpuReqCount = g.AsApproximateFloat64() / vgpuCoeff
+					gpuReqCount = g.AsApproximateFloat64()
 				} else if g, ok := container.Resources.Limits["k8s.amazonaws.com/vgpu"]; ok {
-					gpuReqCount = g.AsApproximateFloat64() / vgpuCoeff
+					gpuReqCount = g.AsApproximateFloat64()
 				}
 				GPUReqV := []*util.Vector{
 					{
@@ -1092,16 +1080,32 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 		// not all providers are guaranteed to use this, so don't overwrite a Provider assignment if we can't find something under that capacity exists
 		gpuc := 0.0
 		q, ok := n.Status.Capacity["nvidia.com/gpu"]
-		if ok {
+		_, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
+
+		if ok && !hasReplicas {
 			gpuCount := q.Value()
 			if gpuCount != 0 {
 				newCnode.GPU = fmt.Sprintf("%d", gpuCount)
+				newCnode.VGPU = newCnode.GPU
 				gpuc = float64(gpuCount)
 			}
+		} else if hasReplicas { // See https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html
+			if q.Value() == 0 {
+				q = n.Status.Capacity["nvidia.com/gpu.shared"]
+			}
+			g, ok := n.Labels["nvidia.com/gpu.count"]
+			if ok {
+				newCnode.GPU = g
+			} else {
+				newCnode.GPU = fmt.Sprintf("%d", 0)
+			}
+			newCnode.VGPU = fmt.Sprintf("%d", q.Value())
+
 		} else if g, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
 			gpuCount := g.Value()
 			if gpuCount != 0 {
 				newCnode.GPU = fmt.Sprintf("%d", int(float64(gpuCount)/vgpuCoeff))
+				newCnode.VGPU = fmt.Sprintf("%d", gpuCount)
 				gpuc = float64(gpuCount) / vgpuCoeff
 			}
 		} else {

+ 9 - 1
pkg/costmodel/metrics.go

@@ -618,7 +618,15 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 				}
 				if len(costs.GPUReq) > 0 {
 					// allocation here is set to the request because shared GPU usage not yet supported.
-					cmme.GPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(costs.GPUReq[0].Value)
+					// if VGPUs, scale the request by (physical GPUs / virtual GPUs)
+					vgpu, verr := strconv.ParseFloat(nodes[nodeName].VGPU, 64)
+					gpu, err := strconv.ParseFloat(nodes[nodeName].GPU, 64)
+					gpualloc := costs.GPUReq[0].Value
+					if verr == nil && err == nil && vgpu != 0 {
+						gpualloc = gpualloc * (gpu / vgpu)
+					}
+
+					cmme.GPUAllocationRecorder.WithLabelValues(namespace, podName, containerName, nodeName, nodeName).Set(gpualloc)
 				}
 				labelKey := getKeyFromLabelStrings(namespace, podName, containerName, nodeName, nodeName)
 				if podStatus[podName] == v1.PodRunning { // Only report data for current pods