Browse Source

GPU Label Pricing (#3024)

* [Cluster Cache] Block to Pre-Initialize Lists (#3023)

* add blocking behavior on Run() to pre-initialize resource lists.

Signed-off-by: Matt Bolt <mbolt35@gmail.com>

* Ensure we don't request resources for specific option.

Signed-off-by: Matt Bolt <mbolt35@gmail.com>

---------

Signed-off-by: Matt Bolt <mbolt35@gmail.com>
Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

* Added GPU label pricing fixes.

Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

* Updated a unit test.

Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

* Addressed PR feedback.

Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

* Added unit test.

Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

* Remove legacy filters among other dead code paths (#2878)

Signed-off-by: Sean Holcomb <seanholcomb@gmail.com>
Co-authored-by: Alex Meijer <ameijer@users.noreply.github.com>
Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

* Empty commit.

Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>

---------

Signed-off-by: Matt Bolt <mbolt35@gmail.com>
Signed-off-by: Nik Willwerth <nwillwerth@kubecost.com>
Signed-off-by: Sean Holcomb <seanholcomb@gmail.com>
Co-authored-by: Matt Bolt <mbolt35@gmail.com>
Co-authored-by: Sean Holcomb <seanholcomb@gmail.com>
Co-authored-by: Alex Meijer <ameijer@users.noreply.github.com>
nik-kc 1 year ago
parent
commit
1ee7a89046

+ 2 - 0
configs/pricing_schema_gpu_labels.csv

@@ -0,0 +1,2 @@
+EndTimestamp,InstanceID,Region,AssetClass,InstanceIDField,InstanceType,MarketPriceHourly,Version
+2019-04-17 23:34:22 UTC,labelfoo,,gpulabel,foo,,0.75,

+ 4 - 0
pkg/cloud/alibaba/provider.go

@@ -530,6 +530,10 @@ func (alibaba *Alibaba) NodePricing(key models.Key) (*models.Node, models.Pricin
 	return returnNode, meta, nil
 	return returnNode, meta, nil
 }
 }
 
 
+func (alibaba *Alibaba) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 // PVPricing gives a pricing information of a specific PV given by PVkey
 // PVPricing gives a pricing information of a specific PV given by PVkey
 func (alibaba *Alibaba) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (alibaba *Alibaba) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	alibaba.DownloadPricingDataLock.RLock()
 	alibaba.DownloadPricingDataLock.RLock()

+ 4 - 0
pkg/cloud/aws/provider.go

@@ -659,6 +659,10 @@ func (k *awsKey) getUsageType(labels map[string]string) string {
 	return ""
 	return ""
 }
 }
 
 
+func (awsProvider *AWS) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 func (aws *AWS) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (aws *AWS) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	pricing, ok := aws.Pricing[pvk.Features()]
 	pricing, ok := aws.Pricing[pvk.Features()]
 	if !ok {
 	if !ok {

+ 4 - 0
pkg/cloud/azure/provider.go

@@ -1597,6 +1597,10 @@ func (az *Azure) ApplyReservedInstancePricing(nodes map[string]*models.Node) {
 
 
 }
 }
 
 
+func (az *Azure) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 func (az *Azure) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (az *Azure) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	az.DownloadPricingDataLock.RLock()
 	az.DownloadPricingDataLock.RLock()
 	defer az.DownloadPricingDataLock.RUnlock()
 	defer az.DownloadPricingDataLock.RUnlock()

+ 4 - 0
pkg/cloud/gcp/provider.go

@@ -1157,6 +1157,10 @@ func (gcp *GCP) DownloadPricingData() error {
 	return nil
 	return nil
 }
 }
 
 
+func (gcp *GCP) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 func (gcp *GCP) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (gcp *GCP) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	gcp.DownloadPricingDataLock.RLock()
 	gcp.DownloadPricingDataLock.RLock()
 	defer gcp.DownloadPricingDataLock.RUnlock()
 	defer gcp.DownloadPricingDataLock.RUnlock()

+ 1 - 0
pkg/cloud/models/models.go

@@ -309,6 +309,7 @@ type Provider interface {
 	GetDisks() ([]byte, error)
 	GetDisks() ([]byte, error)
 	GetOrphanedResources() ([]OrphanedResource, error)
 	GetOrphanedResources() ([]OrphanedResource, error)
 	NodePricing(Key) (*Node, PricingMetadata, error)
 	NodePricing(Key) (*Node, PricingMetadata, error)
+	GpuPricing(map[string]string) (string, error)
 	PVPricing(PVKey) (*PV, error)
 	PVPricing(PVKey) (*PV, error)
 	NetworkPricing() (*Network, error)           // TODO: add key interface arg for dynamic price fetching
 	NetworkPricing() (*Network, error)           // TODO: add key interface arg for dynamic price fetching
 	LoadBalancerPricing() (*LoadBalancer, error) // TODO: add key interface arg for dynamic price fetching
 	LoadBalancerPricing() (*LoadBalancer, error) // TODO: add key interface arg for dynamic price fetching

+ 4 - 0
pkg/cloud/oracle/provider.go

@@ -62,6 +62,10 @@ func (o *Oracle) NodePricing(key models.Key) (*models.Node, models.PricingMetada
 	return o.RateCardStore.ForKey(key, o.DefaultPricing)
 	return o.RateCardStore.ForKey(key, o.DefaultPricing)
 }
 }
 
 
+func (o *Oracle) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 func (o *Oracle) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (o *Oracle) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	if err := o.ensurePricingData(); err != nil {
 	if err := o.ensurePricingData(); err != nil {
 		return nil, err
 		return nil, err

+ 5 - 0
pkg/cloud/otc/provider.go

@@ -512,6 +512,11 @@ func (otc *OTC) GetOrphanedResources() ([]models.OrphanedResource, error) {
 	return []models.OrphanedResource{}, nil
 	return []models.OrphanedResource{}, nil
 }
 }
 
 
+// TODO: Implement method
+func (otc *OTC) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 // TODO: Implement method
 // TODO: Implement method
 func (otc *OTC) AllNodePricing() (interface{}, error) {
 func (otc *OTC) AllNodePricing() (interface{}, error) {
 	return nil, nil
 	return nil, nil

+ 20 - 1
pkg/cloud/provider/csvprovider.go

@@ -39,6 +39,7 @@ type CSVProvider struct {
 	PricingPV               map[string]*price
 	PricingPV               map[string]*price
 	PVMapField              string
 	PVMapField              string
 	GPUClassPricing         map[string]*price
 	GPUClassPricing         map[string]*price
+	GPULabelPricing         map[string]*price
 	GPUMapFields            []string // Fields in a node's labels that represent the GPU class.
 	GPUMapFields            []string // Fields in a node's labels that represent the GPU class.
 	UsesRegion              bool
 	UsesRegion              bool
 	DownloadPricingDataLock sync.RWMutex
 	DownloadPricingDataLock sync.RWMutex
@@ -67,6 +68,7 @@ func (c *CSVProvider) DownloadPricingData() error {
 	nodeclasscount := make(map[string]float64)
 	nodeclasscount := make(map[string]float64)
 	pvpricing := make(map[string]*price)
 	pvpricing := make(map[string]*price)
 	gpupricing := make(map[string]*price)
 	gpupricing := make(map[string]*price)
+	gpulabelpricing := make(map[string]*price)
 	c.GPUMapFields = make([]string, 0, 1)
 	c.GPUMapFields = make([]string, 0, 1)
 	header, err := csvutil.Header(price{}, "csv")
 	header, err := csvutil.Header(price{}, "csv")
 	if err != nil {
 	if err != nil {
@@ -97,6 +99,7 @@ func (c *CSVProvider) DownloadPricingData() error {
 			c.NodeClassCount = nodeclasscount
 			c.NodeClassCount = nodeclasscount
 			c.PricingPV = pvpricing
 			c.PricingPV = pvpricing
 			c.GPUClassPricing = gpupricing
 			c.GPUClassPricing = gpupricing
+			c.GPULabelPricing = gpulabelpricing
 			return fmt.Errorf("Invalid s3 URI: %s", c.CSVLocation)
 			return fmt.Errorf("Invalid s3 URI: %s", c.CSVLocation)
 		}
 		}
 	} else {
 	} else {
@@ -109,6 +112,7 @@ func (c *CSVProvider) DownloadPricingData() error {
 		c.NodeClassCount = nodeclasscount
 		c.NodeClassCount = nodeclasscount
 		c.PricingPV = pvpricing
 		c.PricingPV = pvpricing
 		c.GPUClassPricing = gpupricing
 		c.GPUClassPricing = gpupricing
+		c.GPULabelPricing = gpulabelpricing
 		return nil
 		return nil
 	}
 	}
 	csvReader := csv.NewReader(csvr)
 	csvReader := csv.NewReader(csvr)
@@ -122,6 +126,7 @@ func (c *CSVProvider) DownloadPricingData() error {
 		c.NodeClassCount = nodeclasscount
 		c.NodeClassCount = nodeclasscount
 		c.PricingPV = pvpricing
 		c.PricingPV = pvpricing
 		c.GPUClassPricing = gpupricing
 		c.GPUClassPricing = gpupricing
+		c.GPULabelPricing = gpulabelpricing
 		return err
 		return err
 	}
 	}
 	for {
 	for {
@@ -178,18 +183,22 @@ func (c *CSVProvider) DownloadPricingData() error {
 		} else if p.AssetClass == "gpu" {
 		} else if p.AssetClass == "gpu" {
 			gpupricing[key] = &p
 			gpupricing[key] = &p
 			c.GPUMapFields = append(c.GPUMapFields, strings.ToLower(p.InstanceIDField))
 			c.GPUMapFields = append(c.GPUMapFields, strings.ToLower(p.InstanceIDField))
+		} else if p.AssetClass == "gpulabel" {
+			labelKeyValue := p.InstanceIDField + "=" + p.InstanceID
+			gpulabelpricing[labelKeyValue] = &p
 		} else {
 		} else {
 			log.Infof("Unrecognized asset class %s, defaulting to node", p.AssetClass)
 			log.Infof("Unrecognized asset class %s, defaulting to node", p.AssetClass)
 			pricing[key] = &p
 			pricing[key] = &p
 			c.NodeMapField = p.InstanceIDField
 			c.NodeMapField = p.InstanceIDField
 		}
 		}
 	}
 	}
-	if len(pricing) > 0 {
+	if len(pricing) > 0 || len(gpupricing) > 0 || len(gpulabelpricing) > 0 {
 		c.Pricing = pricing
 		c.Pricing = pricing
 		c.NodeClassPricing = nodeclasspricing
 		c.NodeClassPricing = nodeclasspricing
 		c.NodeClassCount = nodeclasscount
 		c.NodeClassCount = nodeclasscount
 		c.PricingPV = pvpricing
 		c.PricingPV = pvpricing
 		c.GPUClassPricing = gpupricing
 		c.GPUClassPricing = gpupricing
+		c.GPULabelPricing = gpulabelpricing
 	} else {
 	} else {
 		log.DedupedWarningf(5, "No data received from csv at %s", c.CSVLocation)
 		log.DedupedWarningf(5, "No data received from csv at %s", c.CSVLocation)
 	}
 	}
@@ -288,6 +297,16 @@ func (c *CSVProvider) NodePricing(key models.Key) (*models.Node, models.PricingM
 	return node, models.PricingMetadata{}, nil
 	return node, models.PricingMetadata{}, nil
 }
 }
 
 
+func (c *CSVProvider) GpuPricing(nodeLabels map[string]string) (string, error) {
+	for key, value := range nodeLabels {
+		labelKeyValue := key + "=" + value
+		if p, ok := c.GPULabelPricing[labelKeyValue]; ok {
+			return p.MarketPriceHourly, nil
+		}
+	}
+	return "", nil
+}
+
 func NodeValueFromMapField(m string, n *clustercache.Node, useRegion bool) string {
 func NodeValueFromMapField(m string, n *clustercache.Node, useRegion bool) string {
 	mf := strings.Split(m, ".")
 	mf := strings.Split(m, ".")
 	toReturn := ""
 	toReturn := ""

+ 4 - 0
pkg/cloud/provider/customprovider.go

@@ -261,6 +261,10 @@ func (*CustomProvider) QuerySQL(query string) ([]byte, error) {
 	return nil, nil
 	return nil, nil
 }
 }
 
 
+func (cp *CustomProvider) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 func (cp *CustomProvider) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (cp *CustomProvider) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	cpricing, err := cp.Config.GetCustomPricingData()
 	cpricing, err := cp.Config.GetCustomPricingData()
 	if err != nil {
 	if err != nil {

+ 29 - 2
pkg/cloud/provider/provider.go

@@ -1,6 +1,7 @@
 package provider
 package provider
 
 
 import (
 import (
+	"context"
 	"errors"
 	"errors"
 	"fmt"
 	"fmt"
 	"net"
 	"net"
@@ -10,6 +11,7 @@ import (
 	"strings"
 	"strings"
 	"time"
 	"time"
 
 
+	"github.com/opencost/opencost/core/pkg/util/retry"
 	"github.com/opencost/opencost/pkg/cloud/alibaba"
 	"github.com/opencost/opencost/pkg/cloud/alibaba"
 	"github.com/opencost/opencost/pkg/cloud/aws"
 	"github.com/opencost/opencost/pkg/cloud/aws"
 	"github.com/opencost/opencost/pkg/cloud/azure"
 	"github.com/opencost/opencost/pkg/cloud/azure"
@@ -148,9 +150,24 @@ func ShareTenancyCosts(p models.Provider) bool {
 
 
 // NewProvider looks at the nodespec or provider metadata server to decide which provider to instantiate.
 // NewProvider looks at the nodespec or provider metadata server to decide which provider to instantiate.
 func NewProvider(cache clustercache.ClusterCache, apiKey string, config *config.ConfigFileManager) (models.Provider, error) {
 func NewProvider(cache clustercache.ClusterCache, apiKey string, config *config.ConfigFileManager) (models.Provider, error) {
-	nodes := cache.GetAllNodes()
+	getAllNodesFunc := func() ([]*clustercache.Node, error) {
+		nodes := cache.GetAllNodes()
+		if len(nodes) == 0 {
+			return nil, fmt.Errorf("no nodes found in cluster cache")
+		}
+		return nodes, nil
+	}
+
+	var nodes []*clustercache.Node
+	if !env.IsETLReadOnlyMode() {
+		// the error can be ignored because getAllNodesFunc only errors if nodes is empty, a case which we explicitly
+		// handle by checking the length of nodes below
+		nodes, _ = retry.Retry(context.Background(), getAllNodesFunc, 10, time.Second)
+	} else {
+		nodes, _ = getAllNodesFunc()
+	}
 	if len(nodes) == 0 {
 	if len(nodes) == 0 {
-		log.Infof("Could not locate any nodes for cluster.") // valid in ETL readonly mode
+		log.Infof("Could not locate any nodes for cluster.")
 		return &CustomProvider{
 		return &CustomProvider{
 			Clientset: cache,
 			Clientset: cache,
 			Config:    NewProviderConfig(config, "default.json"),
 			Config:    NewProviderConfig(config, "default.json"),
@@ -291,6 +308,7 @@ func getClusterProperties(node *clustercache.Node) clusterProperties {
 	if env.IsUseCustomProvider() {
 	if env.IsUseCustomProvider() {
 		// Use CSV provider if set
 		// Use CSV provider if set
 		if env.IsUseCSVProvider() {
 		if env.IsUseCSVProvider() {
+			log.Debug("using custom CSV provider")
 			cp.provider = opencost.CSVProvider
 			cp.provider = opencost.CSVProvider
 		}
 		}
 		return cp
 		return cp
@@ -298,34 +316,43 @@ func getClusterProperties(node *clustercache.Node) clusterProperties {
 
 
 	// The second conditional is mainly if you're running opencost outside of GCE, say in a local environment.
 	// The second conditional is mainly if you're running opencost outside of GCE, say in a local environment.
 	if metadata.OnGCE() || strings.HasPrefix(providerID, "gce") {
 	if metadata.OnGCE() || strings.HasPrefix(providerID, "gce") {
+		log.Debug("using GCP provider")
 		cp.provider = opencost.GCPProvider
 		cp.provider = opencost.GCPProvider
 		cp.configFileName = "gcp.json"
 		cp.configFileName = "gcp.json"
 		cp.projectID = gcp.ParseGCPProjectID(providerID)
 		cp.projectID = gcp.ParseGCPProjectID(providerID)
 	} else if strings.HasPrefix(providerID, "aws") {
 	} else if strings.HasPrefix(providerID, "aws") {
+		log.Debug("using AWS provider")
 		cp.provider = opencost.AWSProvider
 		cp.provider = opencost.AWSProvider
 		cp.configFileName = "aws.json"
 		cp.configFileName = "aws.json"
 	} else if strings.Contains(node.Status.NodeInfo.KubeletVersion, "eks") { // Additional check for EKS, via kubelet check
 	} else if strings.Contains(node.Status.NodeInfo.KubeletVersion, "eks") { // Additional check for EKS, via kubelet check
+		log.Debug("using AWS provider from EKS")
 		cp.provider = opencost.AWSProvider
 		cp.provider = opencost.AWSProvider
 		cp.configFileName = "aws.json"
 		cp.configFileName = "aws.json"
 	} else if strings.HasPrefix(providerID, "azure") {
 	} else if strings.HasPrefix(providerID, "azure") {
+		log.Debug("using Azure provider")
 		cp.provider = opencost.AzureProvider
 		cp.provider = opencost.AzureProvider
 		cp.configFileName = "azure.json"
 		cp.configFileName = "azure.json"
 		cp.accountID = azure.ParseAzureSubscriptionID(providerID)
 		cp.accountID = azure.ParseAzureSubscriptionID(providerID)
 	} else if strings.HasPrefix(providerID, "scaleway") { // the scaleway provider ID looks like scaleway://instance/<instance_id>
 	} else if strings.HasPrefix(providerID, "scaleway") { // the scaleway provider ID looks like scaleway://instance/<instance_id>
+		log.Debug("using Scaleway provider")
 		cp.provider = opencost.ScalewayProvider
 		cp.provider = opencost.ScalewayProvider
 		cp.configFileName = "scaleway.json"
 		cp.configFileName = "scaleway.json"
 	} else if strings.Contains(node.Status.NodeInfo.KubeletVersion, "aliyun") { // provider ID is not prefix with any distinct keyword like other providers
 	} else if strings.Contains(node.Status.NodeInfo.KubeletVersion, "aliyun") { // provider ID is not prefix with any distinct keyword like other providers
+		log.Debug("using Alibaba provider")
 		cp.provider = opencost.AlibabaProvider
 		cp.provider = opencost.AlibabaProvider
 		cp.configFileName = "alibaba.json"
 		cp.configFileName = "alibaba.json"
 	} else if strings.HasPrefix(providerID, "ocid") {
 	} else if strings.HasPrefix(providerID, "ocid") {
+		log.Debug("using Oracle provider")
 		cp.provider = opencost.OracleProvider
 		cp.provider = opencost.OracleProvider
 		cp.configFileName = "oracle.json"
 		cp.configFileName = "oracle.json"
 	} else if _, ok := node.Labels["cce.cloud.com/cce-nodepool"]; ok { // The node label "cce.cloud.com/cce-nodepool" exists
 	} else if _, ok := node.Labels["cce.cloud.com/cce-nodepool"]; ok { // The node label "cce.cloud.com/cce-nodepool" exists
+		log.Debug("using OTC provider")
 		cp.provider = opencost.OTCProvider
 		cp.provider = opencost.OTCProvider
 		cp.configFileName = "otc.json"
 		cp.configFileName = "otc.json"
 	}
 	}
 	// Override provider to CSV if CSVProvider is used and custom provider is not set
 	// Override provider to CSV if CSVProvider is used and custom provider is not set
 	if env.IsUseCSVProvider() {
 	if env.IsUseCSVProvider() {
+		log.Debug("using CSV provider")
 		cp.provider = opencost.CSVProvider
 		cp.provider = opencost.CSVProvider
 	}
 	}
 
 

+ 4 - 0
pkg/cloud/scaleway/provider.go

@@ -224,6 +224,10 @@ func (c *Scaleway) GetPVKey(pv *clustercache.PersistentVolume, parameters map[st
 	}
 	}
 }
 }
 
 
+func (c *Scaleway) GpuPricing(nodeLabels map[string]string) (string, error) {
+	return "", nil
+}
+
 func (c *Scaleway) PVPricing(pvk models.PVKey) (*models.PV, error) {
 func (c *Scaleway) PVPricing(pvk models.PVKey) (*models.PV, error) {
 	c.DownloadPricingDataLock.RLock()
 	c.DownloadPricingDataLock.RLock()
 	defer c.DownloadPricingDataLock.RUnlock()
 	defer c.DownloadPricingDataLock.RUnlock()

+ 1 - 29
pkg/costmodel/cluster_helpers.go

@@ -191,12 +191,6 @@ func buildGPUCostMap(
 	gpuCostMap := make(map[NodeIdentifier]float64)
 	gpuCostMap := make(map[NodeIdentifier]float64)
 	clusterAndNameToType := make(map[nodeIdentifierNoProviderID]string)
 	clusterAndNameToType := make(map[nodeIdentifierNoProviderID]string)
 
 
-	customPricingEnabled := provider.CustomPricesEnabled(cp)
-	customPricingConfig, err := cp.GetConfig()
-	if err != nil {
-		log.Warnf("ClusterNodes: failed to load custom pricing: %s", err)
-	}
-
 	for _, result := range resNodeGPUCost {
 	for _, result := range resNodeGPUCost {
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		if err != nil {
 		if err != nil {
@@ -222,33 +216,11 @@ func buildGPUCostMap(
 			Name:    name,
 			Name:    name,
 		}
 		}
 
 
-		var gpuCost float64
-
-		if customPricingEnabled && customPricingConfig != nil {
-
-			var customGPUStr string
-			if spot, ok := preemptible[key]; ok && spot {
-				customGPUStr = customPricingConfig.SpotGPU
-			} else {
-				customGPUStr = customPricingConfig.GPU
-			}
-
-			customGPUCost, err := strconv.ParseFloat(customGPUStr, 64)
-			if err != nil {
-				log.Warnf("ClusterNodes: error parsing custom GPU price: %s", customGPUStr)
-			}
-			gpuCost = customGPUCost
-
-		} else {
-
-			gpuCost = result.Values[0].Value
-
-		}
-
 		clusterAndNameToType[keyNon] = nodeType
 		clusterAndNameToType[keyNon] = nodeType
 
 
 		// If gpu count is available use it to multiply gpu cost
 		// If gpu count is available use it to multiply gpu cost
 		if value, ok := gpuCountMap[key]; ok {
 		if value, ok := gpuCountMap[key]; ok {
+			gpuCost := result.Values[0].Value
 			gpuCostMap[key] = gpuCost * value
 			gpuCostMap[key] = gpuCost * value
 		} else {
 		} else {
 			gpuCostMap[key] = 0
 			gpuCostMap[key] = 0

+ 1 - 1
pkg/costmodel/cluster_helpers_test.go

@@ -1066,7 +1066,7 @@ func TestAssetCustompricing(t *testing.T) {
 			expectedPricing: map[string]float64{
 			expectedPricing: map[string]float64{
 				"CPU":     0.027397,              // 20.0 / 730
 				"CPU":     0.027397,              // 20.0 / 730
 				"RAM":     5.102716386318207e-12, // 4.0 / 730 / 1024^3
 				"RAM":     5.102716386318207e-12, // 4.0 / 730 / 1024^3
-				"GPU":     1.369864,              // 500.0 / 730 * 2
+				"GPU":     1.0,                   // gpu costs cannot be set by a custom provider
 				"Storage": 0.000137,              // 0.1 / 730 * (1073741824.0 / 1024 / 1024 / 1024) * (60 / 60) => 0.1 / 730 * 1 * 1
 				"Storage": 0.000137,              // 0.1 / 730 * (1073741824.0 / 1024 / 1024 / 1024) * (60 / 60) => 0.1 / 730 * 1 * 1
 			},
 			},
 		},
 		},

+ 123 - 77
pkg/costmodel/costmodel.go

@@ -1102,6 +1102,38 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 				defaultCPUCorePrice = 0
 				defaultCPUCorePrice = 0
 			}
 			}
 
 
+			// Some customers may want GPU pricing to be determined by the labels affixed to their nodes. GpuPricing
+			// passes the node's labels to the provider, which then cross-references them with the labels that the
+			// provider knows to have label-specific costs associated with them, and returns that cost. See CSVProvider
+			// for an example implementation.
+			var gpuPrice float64
+			gpuPricing, err := cp.GpuPricing(nodeLabels)
+			if err != nil {
+				log.Errorf("Could not determine custom GPU pricing: %s", err)
+				gpuPrice = 0
+			} else if len(gpuPricing) > 0 {
+				gpuPrice, err = strconv.ParseFloat(gpuPricing, 64)
+				if err != nil {
+					log.Errorf("Could not parse custom GPU pricing: %s", err)
+					gpuPrice = 0
+				} else if math.IsNaN(gpuPrice) {
+					log.Warnf("Custom GPU pricing parsed as NaN. Setting to 0.")
+					gpuPrice = 0
+				} else {
+					log.Infof("Using custom GPU pricing for node \"%s\": %f", name, gpuPrice)
+				}
+			} else {
+				gpuPrice, err = strconv.ParseFloat(cfg.GPU, 64)
+				if err != nil {
+					log.Errorf("Could not parse default gpu price")
+					gpuPrice = 0
+				}
+				if math.IsNaN(gpuPrice) {
+					log.Warnf("defaultGPU parsed as NaN. Setting to 0.")
+					gpuPrice = 0
+				}
+			}
+
 			defaultRAMPrice, err := strconv.ParseFloat(cfg.RAM, 64)
 			defaultRAMPrice, err := strconv.ParseFloat(cfg.RAM, 64)
 			if err != nil {
 			if err != nil {
 				log.Errorf("Could not parse default ram price")
 				log.Errorf("Could not parse default ram price")
@@ -1123,13 +1155,13 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 			}
 			}
 			// Just say no to doing the ratios!
 			// Just say no to doing the ratios!
 			cpuCost := defaultCPUCorePrice * cpu
 			cpuCost := defaultCPUCorePrice * cpu
-			gpuCost := defaultGPUPrice * gpuc
+			gpuCost := gpuPrice * gpuc
 			ramCost := defaultRAMPrice * ram
 			ramCost := defaultRAMPrice * ram
 			nodeCost := cpuCost + gpuCost + ramCost
 			nodeCost := cpuCost + gpuCost + ramCost
 
 
 			newCnode.Cost = fmt.Sprintf("%f", nodeCost)
 			newCnode.Cost = fmt.Sprintf("%f", nodeCost)
 			newCnode.VCPUCost = fmt.Sprintf("%f", defaultCPUCorePrice)
 			newCnode.VCPUCost = fmt.Sprintf("%f", defaultCPUCorePrice)
-			newCnode.GPUCost = fmt.Sprintf("%f", defaultGPUPrice)
+			newCnode.GPUCost = fmt.Sprintf("%f", gpuPrice)
 			newCnode.RAMCost = fmt.Sprintf("%f", defaultRAMPrice)
 			newCnode.RAMCost = fmt.Sprintf("%f", defaultRAMPrice)
 			newCnode.RAMBytes = fmt.Sprintf("%f", ram)
 			newCnode.RAMBytes = fmt.Sprintf("%f", ram)
 
 
@@ -1143,95 +1175,109 @@ func (cm *CostModel) GetNodeCost(cp costAnalyzerCloud.Provider) (map[string]*cos
 			// cost among the CPU, RAM, and GPU.
 			// cost among the CPU, RAM, and GPU.
 			log.Tracef("GPU without cost found for %s, calculating...", cp.GetKey(nodeLabels, n).Features())
 			log.Tracef("GPU without cost found for %s, calculating...", cp.GetKey(nodeLabels, n).Features())
 
 
-			defaultCPU, err := strconv.ParseFloat(cfg.CPU, 64)
+			// Some customers may want GPU pricing to be determined by the labels affixed to their nodes. GpuPricing
+			// passes the node's labels to the provider, which then cross-references them with the labels that the
+			// provider knows to have label-specific costs associated with them, and returns that cost. See CSVProvider
+			// for an example implementation.
+			gpuPricing, err := cp.GpuPricing(nodeLabels)
 			if err != nil {
 			if err != nil {
-				log.Errorf("Could not parse default cpu price")
-				defaultCPU = 0
-			}
-			if math.IsNaN(defaultCPU) {
-				log.Warnf("defaultCPU parsed as NaN. Setting to 0.")
-				defaultCPU = 0
+				log.Errorf("Could not determine custom GPU pricing: %s", err)
+			} else if len(gpuPricing) > 0 {
+				newCnode.GPUCost = gpuPricing
+				log.Infof("Using custom GPU pricing for node \"%s\": %s", name, gpuPricing)
 			}
 			}
 
 
-			defaultRAM, err := strconv.ParseFloat(cfg.RAM, 64)
-			if err != nil {
-				log.Errorf("Could not parse default ram price")
-				defaultRAM = 0
-			}
-			if math.IsNaN(defaultRAM) {
-				log.Warnf("defaultRAM parsed as NaN. Setting to 0.")
-				defaultRAM = 0
-			}
+			if newCnode.GPUCost == "" {
+				defaultCPU, err := strconv.ParseFloat(cfg.CPU, 64)
+				if err != nil {
+					log.Errorf("Could not parse default cpu price")
+					defaultCPU = 0
+				}
+				if math.IsNaN(defaultCPU) {
+					log.Warnf("defaultCPU parsed as NaN. Setting to 0.")
+					defaultCPU = 0
+				}
 
 
-			defaultGPU, err := strconv.ParseFloat(cfg.GPU, 64)
-			if err != nil {
-				log.Errorf("Could not parse default gpu price")
-				defaultGPU = 0
-			}
-			if math.IsNaN(defaultGPU) {
-				log.Warnf("defaultGPU parsed as NaN. Setting to 0.")
-				defaultGPU = 0
-			}
+				defaultRAM, err := strconv.ParseFloat(cfg.RAM, 64)
+				if err != nil {
+					log.Errorf("Could not parse default ram price")
+					defaultRAM = 0
+				}
+				if math.IsNaN(defaultRAM) {
+					log.Warnf("defaultRAM parsed as NaN. Setting to 0.")
+					defaultRAM = 0
+				}
 
 
-			cpuToRAMRatio := defaultCPU / defaultRAM
-			if math.IsNaN(cpuToRAMRatio) {
-				log.Warnf("cpuToRAMRatio[defaultCPU: %f / defaultRAM: %f] is NaN. Setting to 10.", defaultCPU, defaultRAM)
-				cpuToRAMRatio = 10
-			}
+				defaultGPU, err := strconv.ParseFloat(cfg.GPU, 64)
+				if err != nil {
+					log.Errorf("Could not parse default gpu price")
+					defaultGPU = 0
+				}
+				if math.IsNaN(defaultGPU) {
+					log.Warnf("defaultGPU parsed as NaN. Setting to 0.")
+					defaultGPU = 0
+				}
 
 
-			gpuToRAMRatio := defaultGPU / defaultRAM
-			if math.IsNaN(gpuToRAMRatio) {
-				log.Warnf("gpuToRAMRatio is NaN. Setting to 100.")
-				gpuToRAMRatio = 100
-			}
+				cpuToRAMRatio := defaultCPU / defaultRAM
+				if math.IsNaN(cpuToRAMRatio) {
+					log.Warnf("cpuToRAMRatio[defaultCPU: %f / defaultRAM: %f] is NaN. Setting to 10.", defaultCPU, defaultRAM)
+					cpuToRAMRatio = 10
+				}
 
 
-			ramGB := ram / 1024 / 1024 / 1024
-			if math.IsNaN(ramGB) {
-				log.Warnf("ramGB is NaN. Setting to 0.")
-				ramGB = 0
-			}
+				gpuToRAMRatio := defaultGPU / defaultRAM
+				if math.IsNaN(gpuToRAMRatio) {
+					log.Warnf("gpuToRAMRatio is NaN. Setting to 100.")
+					gpuToRAMRatio = 100
+				}
 
 
-			ramMultiple := gpuc*gpuToRAMRatio + cpu*cpuToRAMRatio + ramGB
-			if math.IsNaN(ramMultiple) {
-				log.Warnf("ramMultiple is NaN. Setting to 0.")
-				ramMultiple = 0
-			}
+				ramGB := ram / 1024 / 1024 / 1024
+				if math.IsNaN(ramGB) {
+					log.Warnf("ramGB is NaN. Setting to 0.")
+					ramGB = 0
+				}
 
 
-			var nodePrice float64
-			if newCnode.Cost != "" {
-				nodePrice, err = strconv.ParseFloat(newCnode.Cost, 64)
-				if err != nil {
-					log.Errorf("Could not parse total node price")
-					return nil, err
+				ramMultiple := gpuc*gpuToRAMRatio + cpu*cpuToRAMRatio + ramGB
+				if math.IsNaN(ramMultiple) {
+					log.Warnf("ramMultiple is NaN. Setting to 0.")
+					ramMultiple = 0
 				}
 				}
-			} else if newCnode.VCPUCost != "" {
-				nodePrice, err = strconv.ParseFloat(newCnode.VCPUCost, 64) // all the price was allocated to the CPU
-				if err != nil {
-					log.Errorf("Could not parse node vcpu price")
-					return nil, err
+
+				var nodePrice float64
+				if newCnode.Cost != "" {
+					nodePrice, err = strconv.ParseFloat(newCnode.Cost, 64)
+					if err != nil {
+						log.Errorf("Could not parse total node price")
+						return nil, err
+					}
+				} else if newCnode.VCPUCost != "" {
+					nodePrice, err = strconv.ParseFloat(newCnode.VCPUCost, 64) // all the price was allocated to the CPU
+					if err != nil {
+						log.Errorf("Could not parse node vcpu price")
+						return nil, err
+					}
+				} else { // add case to use default pricing model when API data fails.
+					log.Debugf("No node price or CPUprice found, falling back to default")
+					nodePrice = defaultCPU*cpu + defaultRAM*ram + gpuc*defaultGPU
+				}
+				if math.IsNaN(nodePrice) {
+					log.Warnf("nodePrice parsed as NaN. Setting to 0.")
+					nodePrice = 0
 				}
 				}
-			} else { // add case to use default pricing model when API data fails.
-				log.Debugf("No node price or CPUprice found, falling back to default")
-				nodePrice = defaultCPU*cpu + defaultRAM*ram + gpuc*defaultGPU
-			}
-			if math.IsNaN(nodePrice) {
-				log.Warnf("nodePrice parsed as NaN. Setting to 0.")
-				nodePrice = 0
-			}
 
 
-			ramPrice := (nodePrice / ramMultiple)
-			if math.IsNaN(ramPrice) {
-				log.Warnf("ramPrice[nodePrice: %f / ramMultiple: %f] parsed as NaN. Setting to 0.", nodePrice, ramMultiple)
-				ramPrice = 0
-			}
+				ramPrice := (nodePrice / ramMultiple)
+				if math.IsNaN(ramPrice) {
+					log.Warnf("ramPrice[nodePrice: %f / ramMultiple: %f] parsed as NaN. Setting to 0.", nodePrice, ramMultiple)
+					ramPrice = 0
+				}
 
 
-			cpuPrice := ramPrice * cpuToRAMRatio
-			gpuPrice := ramPrice * gpuToRAMRatio
+				cpuPrice := ramPrice * cpuToRAMRatio
+				gpuPrice := ramPrice * gpuToRAMRatio
 
 
-			newCnode.VCPUCost = fmt.Sprintf("%f", cpuPrice)
-			newCnode.RAMCost = fmt.Sprintf("%f", ramPrice)
-			newCnode.RAMBytes = fmt.Sprintf("%f", ram)
-			newCnode.GPUCost = fmt.Sprintf("%f", gpuPrice)
+				newCnode.VCPUCost = fmt.Sprintf("%f", cpuPrice)
+				newCnode.RAMCost = fmt.Sprintf("%f", ramPrice)
+				newCnode.RAMBytes = fmt.Sprintf("%f", ram)
+				newCnode.GPUCost = fmt.Sprintf("%f", gpuPrice)
+			}
 		} else if newCnode.RAMCost == "" {
 		} else if newCnode.RAMCost == "" {
 			// We reach this when no RAM cost is defined in the OnDemand
 			// We reach this when no RAM cost is defined in the OnDemand
 			// pricing. It calculates a cpuToRAMRatio and ramMultiple to
 			// pricing. It calculates a cpuToRAMRatio and ramMultiple to

+ 91 - 0
test/cloud_test.go

@@ -236,6 +236,97 @@ func TestNodePriceFromCSVWithGPU(t *testing.T) {
 
 
 }
 }
 
 
+func TestNodePriceFromCSVWithGPULabels(t *testing.T) {
+	nameWant := "gke-standard-cluster-1-pool-1-91dc432d-cg69"
+	wantGPUCost := "0.75"
+
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
+	n := &clustercache.Node{}
+	n.SpecProviderID = "providerid"
+	n.Name = nameWant
+	n.Labels = make(map[string]string)
+	n.Labels["foo"] = "labelfoo"
+	n.Labels["nvidia.com/gpu_type"] = "Quadro_RTX_4000"
+	n.Status.Capacity = v1.ResourceList{"nvidia.com/gpu": *resource.NewScaledQuantity(2, 0)}
+
+	c := &provider.CSVProvider{
+		CSVLocation: "../configs/pricing_schema_gpu_labels.csv",
+		CustomProvider: &provider.CustomProvider{
+			Config: provider.NewProviderConfig(confMan, "../configs/default.json"),
+		},
+	}
+
+	c.DownloadPricingData()
+
+	fc := NewFakeNodeCache([]*clustercache.Node{n})
+	fm := FakeClusterMap{}
+	d, _ := time.ParseDuration("1m")
+
+	model := costmodel.NewCostModel(nil, nil, fc, fm, d)
+
+	nodeMap, err := model.GetNodeCost(c)
+	if err != nil {
+		t.Errorf("Error in NodePricing: %s", err.Error())
+	} else {
+		if node, ok := nodeMap[nameWant]; ok {
+			if node.GPUCost != wantGPUCost {
+				t.Errorf("Wanted gpu cost '%v' got gpu cost '%v'", wantGPUCost, node.GPUCost)
+			}
+		} else {
+			t.Errorf("Node %s not found in node map", nameWant)
+		}
+	}
+}
+
+func TestRKE2NodePriceFromCSVWithGPULabels(t *testing.T) {
+	nameWant := "gke-standard-cluster-1-pool-1-91dc432d-cg69"
+	wantGPUCost := "0.750000"
+
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
+	n := &clustercache.Node{}
+	n.SpecProviderID = "providerid"
+	n.Name = nameWant
+	n.Labels = make(map[string]string)
+	n.Labels["foo"] = "labelfoo"
+	n.Labels["nvidia.com/gpu_type"] = "Quadro_RTX_4000"
+	n.Labels[v1.LabelInstanceTypeStable] = "rke2"
+	n.Status.Capacity = v1.ResourceList{"nvidia.com/gpu": *resource.NewScaledQuantity(2, 0)}
+
+	c := &provider.CSVProvider{
+		CSVLocation: "../configs/pricing_schema_gpu_labels.csv",
+		CustomProvider: &provider.CustomProvider{
+			Config: provider.NewProviderConfig(confMan, "../configs/default.json"),
+		},
+	}
+
+	c.DownloadPricingData()
+
+	fc := NewFakeNodeCache([]*clustercache.Node{n})
+	fm := FakeClusterMap{}
+	d, _ := time.ParseDuration("1m")
+
+	model := costmodel.NewCostModel(nil, nil, fc, fm, d)
+
+	nodeMap, err := model.GetNodeCost(c)
+	if err != nil {
+		t.Errorf("Error in NodePricing: %s", err.Error())
+	} else {
+		if node, ok := nodeMap[nameWant]; ok {
+			if node.GPUCost != wantGPUCost {
+				t.Errorf("Wanted gpu cost '%v' got gpu cost '%v'", wantGPUCost, node.GPUCost)
+			}
+		} else {
+			t.Errorf("Node %s not found in node map", nameWant)
+		}
+	}
+}
+
 func TestNodePriceFromCSVSpecialChar(t *testing.T) {
 func TestNodePriceFromCSVSpecialChar(t *testing.T) {
 	nameWant := "gke-standard-cluster-1-pool-1-91dc432d-cg69"
 	nameWant := "gke-standard-cluster-1-pool-1-91dc432d-cg69"
 
 

+ 1 - 0
test/configs/default.json

@@ -0,0 +1 @@
+{"provider":"base","description":"Default prices based on GCP us-central1","CPU":"0.021811","spotCPU":"0.006543","RAM":"0.002923","spotRAM":"0.000877","GPU":"0.95","spotGPU":"0.308","storage":"0.00005479452","zoneNetworkEgress":"0.01","regionNetworkEgress":"0.01","internetNetworkEgress":"0.12","firstFiveForwardingRulesCost":"","additionalForwardingRuleCost":"","LBIngressDataCost":"","athenaBucketName":"","athenaRegion":"","athenaDatabase":"","athenaCatalog":"","athenaTable":"","athenaWorkgroup":"","masterPayerARN":"","customPricesEnabled":"false","defaultIdle":"","azureSubscriptionID":"","azureClientID":"","azureClientSecret":"","azureTenantID":"","azureBillingRegion":"","azureBillingAccount":"","azureOfferDurableID":"","azureStorageSubscriptionID":"","azureStorageAccount":"","azureStorageAccessKey":"","azureStorageContainer":"","azureContainerPath":"","azureCloud":"","currencyCode":"","discount":"","negotiatedDiscount":"","sharedOverhead":"","clusterName":"","sharedNamespaces":"","sharedLabelNames":"","sharedLabelValues":"","shareTenancyCosts":"true","readOnly":"","editorAccess":"","kubecostToken":"","googleAnalyticsTag":"","excludeProviderID":"","defaultLBPrice":""}