Просмотр исходного кода

Fix custom provider GPU default pricing (#3830)

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Alex Meijer 1 неделя назад
Родитель
Сommit
0580d4561b

+ 28 - 6
pkg/cloud/provider/customprovider.go

@@ -73,6 +73,8 @@ type customProviderKey struct {
 	SpotLabelValue string
 	GPULabel       string
 	GPULabelValue  string
+	GPUTypeName    string
+	GPUCountValue  int
 	Labels         map[string]string
 }
 
@@ -179,8 +181,12 @@ func (cp *CustomProvider) NodePricing(key models.Key) (*models.Node, models.Pric
 		k = "default"
 	}
 	if key.GPUType() != "" {
-		k += ",gpu"    // TODO: support multiple custom gpu types.
-		gpuCount = "1" // TODO: support more than one gpu.
+		k += ",gpu" // TODO: support multiple custom gpu types.
+		if key.GPUCount() > 0 {
+			gpuCount = strconv.Itoa(key.GPUCount())
+		} else {
+			gpuCount = "1"
+		}
 	}
 
 	var cpuCost, ramCost, gpuCost string
@@ -236,11 +242,25 @@ func (cp *CustomProvider) DownloadPricingData() error {
 }
 
 func (cp *CustomProvider) GetKey(labels map[string]string, n *clustercache.Node) models.Key {
+	gpuTypeName := ""
+	gpuCount := 0
+	if n != nil {
+		if gpu, ok := n.Status.Capacity["nvidia.com/gpu"]; ok && gpu.Value() > 0 {
+			gpuTypeName = "nvidia.com/gpu"
+			gpuCount = int(gpu.Value())
+		} else if vgpu, ok := n.Status.Capacity["k8s.amazonaws.com/vgpu"]; ok && vgpu.Value() > 0 {
+			gpuTypeName = "k8s.amazonaws.com/vgpu"
+			gpuCount = int(vgpu.Value())
+		}
+	}
+
 	return &customProviderKey{
 		SpotLabel:      cp.SpotLabel,
 		SpotLabelValue: cp.SpotLabelValue,
 		GPULabel:       cp.GPULabel,
 		GPULabelValue:  cp.GPULabelValue,
+		GPUTypeName:    gpuTypeName,
+		GPUCountValue:  gpuCount,
 		Labels:         labels,
 	}
 }
@@ -375,14 +395,16 @@ func (key *customPVKey) Features() string {
 }
 
 func (k *customProviderKey) GPUCount() int {
-	return 0
+	return k.GPUCountValue
 }
 
 func (cpk *customProviderKey) GPUType() string {
-	if t, ok := cpk.Labels[cpk.GPULabel]; ok {
-		return t
+	if cpk.GPULabel != "" {
+		if t, ok := cpk.Labels[cpk.GPULabel]; ok {
+			return t
+		}
 	}
-	return ""
+	return cpk.GPUTypeName
 }
 
 func (cpk *customProviderKey) ID() string {

+ 163 - 0
pkg/cloud/provider/provider_test.go

@@ -2,6 +2,12 @@ package provider
 
 import (
 	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/opencost/opencost/core/pkg/storage"
+	"github.com/opencost/opencost/pkg/config"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 )
 
 func TestParseLocalDiskID(t *testing.T) {
@@ -42,3 +48,160 @@ func TestParseLocalDiskID(t *testing.T) {
 		})
 	}
 }
+
+func TestProviderConfigUpdateFromMapPreservesHourlyPrices(t *testing.T) {
+	confMan := config.NewConfigFileManager(storage.NewMemoryStorage())
+	providerConfig := NewProviderConfig(confMan, "default.json")
+
+	updated, err := providerConfig.UpdateFromMap(map[string]string{
+		"CPU":     "0.031611",
+		"spotCPU": "0.006655",
+		"RAM":     "0.004237",
+		"spotRAM": "0.000892",
+		"GPU":     "0.95",
+		"spotGPU": "0.308",
+		"storage": "0.00005479452",
+	})
+	if err != nil {
+		t.Fatalf("UpdateFromMap returned error: %v", err)
+	}
+
+	if updated.CPU != "0.031611" {
+		t.Errorf("CPU = %q, want hourly value %q", updated.CPU, "0.031611")
+	}
+	if updated.SpotCPU != "0.006655" {
+		t.Errorf("SpotCPU = %q, want hourly value %q", updated.SpotCPU, "0.006655")
+	}
+	if updated.RAM != "0.004237" {
+		t.Errorf("RAM = %q, want hourly value %q", updated.RAM, "0.004237")
+	}
+	if updated.SpotRAM != "0.000892" {
+		t.Errorf("SpotRAM = %q, want hourly value %q", updated.SpotRAM, "0.000892")
+	}
+	if updated.GPU != "0.95" {
+		t.Errorf("GPU = %q, want hourly value %q", updated.GPU, "0.95")
+	}
+	if updated.SpotGPU != "0.308" {
+		t.Errorf("SpotGPU = %q, want hourly value %q", updated.SpotGPU, "0.308")
+	}
+	if updated.Storage != "0.00005479452" {
+		t.Errorf("Storage = %q, want hourly value %q", updated.Storage, "0.00005479452")
+	}
+}
+
+func TestCustomProviderGetKeyDetectsGPUCapacity(t *testing.T) {
+	cases := []struct {
+		name         string
+		provider     *CustomProvider
+		labels       map[string]string
+		capacity     v1.ResourceList
+		wantGPUType  string
+		wantGPUCount int
+	}{
+		{
+			name: "nvidia GPU capacity",
+			capacity: v1.ResourceList{
+				"nvidia.com/gpu": resource.MustParse("2"),
+			},
+			wantGPUType:  "nvidia.com/gpu",
+			wantGPUCount: 2,
+		},
+		{
+			name: "virtual GPU capacity",
+			capacity: v1.ResourceList{
+				"k8s.amazonaws.com/vgpu": resource.MustParse("3"),
+			},
+			wantGPUType:  "k8s.amazonaws.com/vgpu",
+			wantGPUCount: 3,
+		},
+		{
+			name: "configured GPU label takes precedence over capacity type",
+			provider: &CustomProvider{
+				GPULabel: "gpu.example/type",
+			},
+			labels: map[string]string{
+				"gpu.example/type": "a100",
+			},
+			capacity: v1.ResourceList{
+				"nvidia.com/gpu": resource.MustParse("4"),
+			},
+			wantGPUType:  "a100",
+			wantGPUCount: 4,
+		},
+		{
+			name:         "no GPU capacity",
+			capacity:     v1.ResourceList{},
+			wantGPUType:  "",
+			wantGPUCount: 0,
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			customProvider := tt.provider
+			if customProvider == nil {
+				customProvider = &CustomProvider{}
+			}
+			labels := tt.labels
+			if labels == nil {
+				labels = map[string]string{}
+			}
+
+			key := customProvider.GetKey(labels, &clustercache.Node{
+				Labels: labels,
+				Status: v1.NodeStatus{
+					Capacity: tt.capacity,
+				},
+			})
+
+			if got := key.GPUType(); got != tt.wantGPUType {
+				t.Errorf("GPUType() = %q, want %q", got, tt.wantGPUType)
+			}
+			if got := key.GPUCount(); got != tt.wantGPUCount {
+				t.Errorf("GPUCount() = %d, want %d", got, tt.wantGPUCount)
+			}
+		})
+	}
+}
+
+func TestCustomProviderNodePricingUsesDetectedGPUCount(t *testing.T) {
+	customProvider := &CustomProvider{
+		Pricing: map[string]*NodePrice{
+			"default": {
+				CPU: "0.031611",
+				RAM: "0.004237",
+			},
+			"default,gpu": {
+				CPU: "0.031611",
+				RAM: "0.004237",
+				GPU: "0.95",
+			},
+		},
+	}
+
+	key := customProvider.GetKey(map[string]string{}, &clustercache.Node{
+		Status: v1.NodeStatus{
+			Capacity: v1.ResourceList{
+				"nvidia.com/gpu": resource.MustParse("2"),
+			},
+		},
+	})
+
+	node, _, err := customProvider.NodePricing(key)
+	if err != nil {
+		t.Fatalf("NodePricing returned error: %v", err)
+	}
+
+	if node.VCPUCost != "0.031611" {
+		t.Errorf("VCPUCost = %q, want %q", node.VCPUCost, "0.031611")
+	}
+	if node.RAMCost != "0.004237" {
+		t.Errorf("RAMCost = %q, want %q", node.RAMCost, "0.004237")
+	}
+	if node.GPUCost != "0.95" {
+		t.Errorf("GPUCost = %q, want %q", node.GPUCost, "0.95")
+	}
+	if node.GPU != "2" {
+		t.Errorf("GPU = %q, want %q", node.GPU, "2")
+	}
+}

+ 0 - 8
pkg/cloud/provider/providerconfig.go

@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"os"
 	gopath "path"
-	"strconv"
 	"sync"
 
 	coreenv "github.com/opencost/opencost/core/pkg/env"
@@ -191,13 +190,6 @@ func (pc *ProviderConfig) UpdateFromMap(a map[string]string) (*models.CustomPric
 		for k, v := range a {
 			// Just so we consistently supply / receive the same values, uppercase the first letter.
 			kUpper := utils.ToTitle.String(k)
-			if kUpper == "CPU" || kUpper == "SpotCPU" || kUpper == "RAM" || kUpper == "SpotRAM" || kUpper == "GPU" || kUpper == "Storage" {
-				val, err := strconv.ParseFloat(v, 64)
-				if err != nil {
-					return fmt.Errorf("unable to parse CPU from string to float: %s", err.Error())
-				}
-				v = fmt.Sprintf("%f", val/730)
-			}
 
 			err := models.SetCustomPricingField(c, kUpper, v)
 			if err != nil {

+ 8 - 8
pkg/costmodel/cluster_helpers_test.go

@@ -1057,10 +1057,10 @@ func TestAssetCustompricing(t *testing.T) {
 				"customPricesEnabled": "true",
 			},
 			expectedPricing: map[string]float64{
-				"CPU":     0.027397,              // 20.0 / 730
-				"RAM":     5.102716386318207e-12, // 4.0 / 730 / 1024^3
-				"GPU":     1.369864,              // 500.0 / 730 * 2
-				"Storage": 0.000137,              // 0.1 / 730 * (1073741824.0 / 1024 / 1024 / 1024) * (60 / 60) => 0.1 / 730 * 1 * 1
+				"CPU":     20.0,
+				"RAM":     4.0 / 1024.0 / 1024.0 / 1024.0,
+				"GPU":     1000.0, // 500.0 per GPU-hour * 2 GPUs
+				"Storage": 0.1,    // 0.1 per GiB-hour * 1 GiB * 1 hour
 			},
 			zeroCollector: false,
 		},
@@ -1075,10 +1075,10 @@ func TestAssetCustompricing(t *testing.T) {
 				// This tests the fallback behavior when collector returns 0
 			},
 			expectedPricing: map[string]float64{
-				"CPU":     0.027397,              // 20.0 / 730 (fallback from 0)
-				"RAM":     5.102716386318207e-12, // 4.0 / 730 / 1024^3 (fallback from 0)
-				"GPU":     0.0,                   // GPU doesn't have fallback logic
-				"Storage": 1.0,                   // Storage uses separate PV pricing (pvCostPromResult), not affected by node pricing
+				"CPU":     20.0,
+				"RAM":     4.0 / 1024.0 / 1024.0 / 1024.0,
+				"GPU":     0.0, // GPU doesn't have fallback logic
+				"Storage": 1.0, // Storage uses separate PV pricing (pvCostPromResult), not affected by node pricing
 			},
 			zeroCollector: true,
 		},

+ 47 - 0
pkg/costmodel/costmodel_test.go

@@ -8,6 +8,7 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/opencost/opencost/core/pkg/clustercache"
+	coreenv "github.com/opencost/opencost/core/pkg/env"
 	"github.com/opencost/opencost/core/pkg/storage"
 	"github.com/opencost/opencost/core/pkg/util"
 	"github.com/opencost/opencost/pkg/cloud/models"
@@ -531,6 +532,52 @@ func TestNodeCostAnnotations(t *testing.T) {
 	}
 }
 
+func TestCustomProviderGPUNodeUsesDefaultHourlyPricing(t *testing.T) {
+	configPath := t.TempDir()
+	t.Setenv(coreenv.ConfigPathEnvVar, configPath)
+
+	confMan := config.NewConfigFileManager(storage.NewFileStorage("/"))
+	customProvider := &provider.CustomProvider{
+		Config: provider.NewProviderConfig(confMan, "default.json"),
+	}
+	err := customProvider.DownloadPricingData()
+	require.NoError(t, err)
+
+	cfg, err := customProvider.GetConfig()
+	require.NoError(t, err)
+
+	costModel := &CostModel{
+		Provider: customProvider,
+		Cache: NewFakeNodeCache([]*clustercache.Node{
+			{
+				Name: "on-prem-gpu-node",
+				Labels: map[string]string{
+					"kubernetes.io/arch": "amd64",
+				},
+				Status: v1.NodeStatus{
+					Capacity: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("16"),
+						v1.ResourceMemory: resource.MustParse("128Gi"),
+						"nvidia.com/gpu":  resource.MustParse("2"),
+					},
+				},
+			},
+		}),
+	}
+
+	nodeCost, err := costModel.GetNodeCost()
+	require.NoError(t, err)
+
+	node, ok := nodeCost["on-prem-gpu-node"]
+	require.True(t, ok)
+
+	assert.Equal(t, cfg.CPU, node.VCPUCost)
+	assert.Equal(t, cfg.RAM, node.RAMCost)
+	assert.Equal(t, cfg.GPU, node.GPUCost)
+	assert.Equal(t, "2.000000", node.GPU)
+	assert.Empty(t, node.ProviderID)
+}
+
 // FakeNodeCache implements ClusterCache interface for testing
 type FakeNodeCache struct {
 	clustercache.ClusterCache