Parcourir la source

Correctly calculate `gpuCost` when the `"nvidia.com/gpu.replicas"` label is present. (#2862)

* Initial attempt. Splitting out each special case into own conditional.

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Move back to if, elseif conditionals. More explicit in which labels we
are going off of.

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Refactor. Put all GPU parsing logic in function.

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Adds unit test TestGetGPUCount

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Undo change on provider.go

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

* Update getGPUCount to return float64 instead of *float64

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>

---------

Signed-off-by: thomasvn <thomasnguyen96@gmail.com>
Thomas Nguyen il y a 1 an
Parent
commit
16a1e9946e
2 fichiers modifiés avec 167 ajouts et 49 suppressions
  1. 80 49
      pkg/costmodel/costmodel.go
  2. 87 0
      pkg/costmodel/costmodel_test.go

+ 80 - 49
pkg/costmodel/costmodel.go

@@ -988,15 +988,6 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 	nodeList := cm.Cache.GetAllNodes()
 	nodes := make(map[string]*costAnalyzerCloud.Node)
 
-	vgpuCount, err := getAllocatableVGPUs(cm.Cache)
-	if err != nil {
-		return nil, err
-	}
-	vgpuCoeff := 10.0
-	if vgpuCount > 0.0 {
-		vgpuCoeff = vgpuCount
-	}
-
 	pmd := &costAnalyzerCloud.PricingMatchMetadata{
 		TotalNodes:        0,
 		PricingTypeCounts: make(map[costAnalyzerCloud.PricingType]int),
@@ -1028,6 +1019,8 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 			pmd.PricingTypeCounts[cnode.PricingType] = 1
 		}
 
+		// newCnode builds upon cnode but populates/overrides certain fields.
+		// cnode was populated leveraging cloud provider public pricing APIs.
 		newCnode := *cnode
 		if newCnode.InstanceType == "" {
 			it, _ := util.GetInstanceType(n.Labels)
@@ -1070,48 +1063,24 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 
 		newCnode.RAMBytes = fmt.Sprintf("%f", ram)
 
-		// Azure does not seem to provide a GPU count in its pricing API. GKE supports attaching multiple GPUs
-		// So the k8s api will often report more accurate results for GPU count under status > capacity > nvidia.com/gpu than the cloud providers billing data
-		// not all providers are guaranteed to use this, so don't overwrite a Provider assignment if we can't find something under that capacity exists
-		gpuc := 0.0
-		q, ok := n.Status.Capacity["nvidia.com/gpu"]
-		_, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
-
-		if ok && !hasReplicas {
-			gpuCount := q.Value()
-			if gpuCount != 0 {
-				newCnode.GPU = fmt.Sprintf("%d", gpuCount)
-				newCnode.VGPU = newCnode.GPU
-				gpuc = float64(gpuCount)
-			}
-		} else if hasReplicas { // See https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html
-			if q.Value() == 0 {
-				q = n.Status.Capacity["nvidia.com/gpu.shared"]
-			}
-			g, ok := n.Labels["nvidia.com/gpu.count"]
-			if ok {
-				newCnode.GPU = g
-			} else {
-				newCnode.GPU = fmt.Sprintf("%d", 0)
-			}
-			newCnode.VGPU = fmt.Sprintf("%d", q.Value())
+		gpuc, err := strconv.ParseFloat(newCnode.GPU, 64)
+		if err != nil {
+			gpuc = 0.0
+		}
 
-		} else if g, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
-			gpuCount := g.Value()
-			if gpuCount != 0 {
-				newCnode.GPU = fmt.Sprintf("%d", int(float64(gpuCount)/vgpuCoeff))
-				newCnode.VGPU = fmt.Sprintf("%d", gpuCount)
-				gpuc = float64(gpuCount) / vgpuCoeff
-			}
-		} else {
-			gpuc, err = strconv.ParseFloat(newCnode.GPU, 64)
-			if err != nil {
-				gpuc = 0.0
-			}
+		// The k8s API will often report more accurate results for GPU count
+		// than cloud provider public pricing APIs. If found, override the
+		// original value.
+		gpuOverride, vgpuOverride, err := getGPUCount(cm.Cache, n)
+		if err != nil {
+			log.Warnf("Unable to get GPUCount for node %s: %s", n.Name, err.Error())
 		}
-		if math.IsNaN(gpuc) {
-			log.Warnf("gpu count parsed as NaN. Setting to 0.")
-			gpuc = 0.0
+		if gpuOverride > 0 {
+			newCnode.GPU = fmt.Sprintf("%f", gpuOverride)
+			gpuc = gpuOverride
+		}
+		if vgpuOverride > 0 {
+			newCnode.VGPU = fmt.Sprintf("%f", vgpuOverride)
 		}
 
 		// Special case for SUSE rancher, since it won't behave with normal
@@ -2358,6 +2327,68 @@ func getStatefulSetsOfPod(pod v1.Pod) []string {
 	return []string{}
 }
 
+// getGPUCount reads the node's Status and Labels (via the k8s API) to identify
+// the number of GPUs and vGPUs equipped on the node. If unable to identify
+// a GPU count, it will return -1.
+func getGPUCount(cache clustercache.ClusterCache, n *v1.Node) (float64, float64, error) {
+	g, hasGpu := n.Status.Capacity["nvidia.com/gpu"]
+	_, hasReplicas := n.Labels["nvidia.com/gpu.replicas"]
+
+	// Case 1: Standard NVIDIA GPU
+	if hasGpu && g.Value() != 0 && !hasReplicas {
+		return float64(g.Value()), float64(g.Value()), nil
+	}
+
+	// Case 2: NVIDIA GPU with GPU Feature Discovery (GFD) Pod enabled.
+	// Ref: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html#verifying-the-gpu-time-slicing-configuration
+	// Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L44-L45
+	// Ref: https://github.com/NVIDIA/k8s-device-plugin/blob/d899752a424818428f744a946d32b132ea2c0cf1/internal/lm/resource_test.go#L103-L118
+	if hasReplicas {
+		resultGPU := 0.0
+		resultVGPU := 0.0
+
+		if c, ok := n.Labels["nvidia.com/gpu.count"]; ok {
+			var err error
+			resultGPU, err = strconv.ParseFloat(c, 64)
+			if err != nil {
+				return -1, -1, fmt.Errorf("could not parse label \"nvidia.com/gpu.count\": %v", err)
+			}
+		}
+
+		if s, ok := n.Status.Capacity["nvidia.com/gpu.shared"]; ok { // GFD configured `renameByDefault=true`
+			resultVGPU = float64(s.Value())
+		} else if g, ok := n.Status.Capacity["nvidia.com/gpu"]; ok { // GFD configured `renameByDefault=false`
+			resultVGPU = float64(g.Value())
+		} else {
+			resultVGPU = resultGPU
+		}
+
+		return resultGPU, resultVGPU, nil
+	}
+
+	// Case 3: AWS vGPU
+	if vgpu, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok {
+		vgpuCount, err := getAllocatableVGPUs(cache)
+		if err != nil {
+			return -1, -1, err
+		}
+
+		vgpuCoeff := 10.0
+		if vgpuCount > 0.0 {
+			vgpuCoeff = vgpuCount
+		}
+
+		if vgpu.Value() != 0 {
+			resultGPU := float64(vgpu.Value()) / vgpuCoeff
+			resultVGPU := float64(vgpu.Value())
+			return resultGPU, resultVGPU, nil
+		}
+	}
+
+	// No GPU found
+	return -1, -1, nil
+}
+
 func getAllocatableVGPUs(cache clustercache.ClusterCache) (float64, error) {
 	daemonsets := cache.GetAllDaemonSets()
 	vgpuCount := 0.0

+ 87 - 0
pkg/costmodel/costmodel_test.go

@@ -4,8 +4,95 @@ import (
 	"testing"
 
 	"github.com/opencost/opencost/core/pkg/util"
+	"github.com/stretchr/testify/assert"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
+func TestGetGPUCount(t *testing.T) {
+	tests := []struct {
+		name          string
+		node          *v1.Node
+		expectedGPU   float64
+		expectedVGPU  float64
+		expectedError bool
+	}{
+		{
+			name: "Standard NVIDIA GPU",
+			node: &v1.Node{
+				Status: v1.NodeStatus{
+					Capacity: v1.ResourceList{
+						"nvidia.com/gpu": resource.MustParse("2"),
+					},
+				},
+			},
+			expectedGPU:  2.0,
+			expectedVGPU: 2.0,
+		},
+		{
+			name: "NVIDIA GPU with GFD - renameByDefault=true",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"nvidia.com/gpu.replicas": "4",
+						"nvidia.com/gpu.count":    "1",
+					},
+				},
+				Status: v1.NodeStatus{
+					Capacity: v1.ResourceList{
+						"nvidia.com/gpu.shared": resource.MustParse("4"),
+					},
+				},
+			},
+			expectedGPU:  1.0,
+			expectedVGPU: 4.0,
+		},
+		{
+			name: "NVIDIA GPU with GFD - renameByDefault=false",
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"nvidia.com/gpu.replicas": "4",
+						"nvidia.com/gpu.count":    "1",
+					},
+				},
+				Status: v1.NodeStatus{
+					Capacity: v1.ResourceList{
+						"nvidia.com/gpu": resource.MustParse("4"),
+					},
+				},
+			},
+			expectedGPU:  1.0,
+			expectedVGPU: 4.0,
+		},
+		{
+			name: "No GPU",
+			node: &v1.Node{
+				Status: v1.NodeStatus{
+					Capacity: v1.ResourceList{},
+				},
+			},
+			expectedGPU:  -1.0,
+			expectedVGPU: -1.0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gpu, vgpu, err := getGPUCount(nil, tt.node)
+
+			if tt.expectedError {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+				assert.Equal(t, tt.expectedGPU, gpu)
+				assert.Equal(t, tt.expectedVGPU, vgpu)
+			}
+		})
+	}
+}
+
 func Test_CostData_GetController_CronJob(t *testing.T) {
 	cases := []struct {
 		name string