ソースを参照

Add Prometheus metrics for node pricing lookup tracking

Add opencost_node_pricing_lookup_total counter metric to track the
success/failure rate of node pricing lookups. This helps operators
identify when pricing data is falling back to defaults, which could
indicate configuration issues or cloud provider API problems.

The metric includes a "status" label with values:
- "success": Node pricing was retrieved successfully from the provider
- "fallback": Node pricing lookup failed, using fallback/default pricing

Tests include:
- Unit tests for RecordNodePricingLookup function (nil safety, both statuses)
- Metric definition validation tests
- Concurrency safety tests

Addresses #3549 (Part 4 - Prometheus metrics for node pricing lookups)

Signed-off-by: Claude <noreply@anthropic.com>
Claude 3 ヶ月 前
コミット
7162e4d7c1
3 ファイル変更、197 行追加、0 行削除
  1. 3 0
      pkg/costmodel/costmodel.go
  2. 17 0
      pkg/costmodel/metrics.go
  3. 177 0
      pkg/costmodel/metrics_test.go

+ 3 - 0
pkg/costmodel/costmodel.go

@@ -917,6 +917,7 @@ func (cm *CostModel) GetNodeCost() (map[string]*costAnalyzerCloud.Node, error) {
 		if err != nil {
 			log.Infof("Could not get node pricing for node %s. Falling back to default pricing", name)
 			log.Debugf("Error getting node pricing: %s", err.Error())
+			RecordNodePricingLookup("fallback")
 			if cnode != nil {
 				nodes[name] = cnode
 				continue
@@ -926,6 +927,8 @@ func (cm *CostModel) GetNodeCost() (map[string]*costAnalyzerCloud.Node, error) {
 					RAMCost:  cfg.RAM,
 				}
 			}
+		} else {
+			RecordNodePricingLookup("success")
 		}
 
 		pmd.PricingTypeCounts[cnode.PricingType]++

+ 17 - 0
pkg/costmodel/metrics.go

@@ -134,6 +134,7 @@ var (
 	networkInternetEgressCostG prometheus.Gauge
 	clusterManagementCostGv    *prometheus.GaugeVec
 	lbCostGv                   *prometheus.GaugeVec
+	nodePricingLookupTotal     *prometheus.CounterVec
 )
 
 // initCostModelMetrics uses a sync.Once to ensure that these metrics are only created once
@@ -273,6 +274,14 @@ func initCostModelMetrics(clusterInfo clusters.ClusterInfoProvider, metricsConfi
 			toRegisterGV = append(toRegisterGV, lbCostGv)
 		}
 
+		nodePricingLookupTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "opencost_node_pricing_lookup_total",
+			Help: "opencost_node_pricing_lookup_total Total node pricing lookups by status",
+		}, []string{"status"})
+		if _, disabled := disabledMetrics["opencost_node_pricing_lookup_total"]; !disabled {
+			prometheus.MustRegister(nodePricingLookupTotal)
+		}
+
 		// Register cost-model metrics for emission
 		for _, gv := range toRegisterGV {
 			prometheus.MustRegister(gv)
@@ -289,6 +298,14 @@ func initCostModelMetrics(clusterInfo clusters.ClusterInfoProvider, metricsConfi
 	})
 }
 
+// RecordNodePricingLookup increments the node pricing lookup counter for the
+// given status ("success" or "fallback"). It does nothing while the counter is nil.
+func RecordNodePricingLookup(status string) {
+	if vec := nodePricingLookupTotal; vec != nil {
+		vec.WithLabelValues(status).Inc()
+	}
+}
+
 //--------------------------------------------------------------------------
 //  CostModelMetricsEmitter
 //--------------------------------------------------------------------------

+ 177 - 0
pkg/costmodel/metrics_test.go

@@ -0,0 +1,177 @@
+package costmodel
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestRecordNodePricingLookup(t *testing.T) {
+	// Save original and restore after test
+	original := nodePricingLookupTotal
+	defer func() { nodePricingLookupTotal = original }()
+
+	t.Run("nil counter does not panic", func(t *testing.T) {
+		nodePricingLookupTotal = nil
+		// Should not panic
+		RecordNodePricingLookup("success")
+		RecordNodePricingLookup("fallback")
+	})
+
+	t.Run("records success status", func(t *testing.T) {
+		nodePricingLookupTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "test_node_pricing_lookup_total",
+			Help: "Test metric",
+		}, []string{"status"})
+
+		RecordNodePricingLookup("success")
+		RecordNodePricingLookup("success")
+
+		metric := &dto.Metric{}
+		err := nodePricingLookupTotal.WithLabelValues("success").(prometheus.Counter).Write(metric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(2), metric.GetCounter().GetValue())
+	})
+
+	t.Run("records fallback status", func(t *testing.T) {
+		nodePricingLookupTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "test_node_pricing_lookup_total_2",
+			Help: "Test metric",
+		}, []string{"status"})
+
+		RecordNodePricingLookup("fallback")
+
+		metric := &dto.Metric{}
+		err := nodePricingLookupTotal.WithLabelValues("fallback").(prometheus.Counter).Write(metric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(1), metric.GetCounter().GetValue())
+	})
+
+	t.Run("tracks success and fallback separately", func(t *testing.T) {
+		nodePricingLookupTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "test_node_pricing_lookup_total_3",
+			Help: "Test metric",
+		}, []string{"status"})
+
+		RecordNodePricingLookup("success")
+		RecordNodePricingLookup("success")
+		RecordNodePricingLookup("success")
+		RecordNodePricingLookup("fallback")
+
+		successMetric := &dto.Metric{}
+		err := nodePricingLookupTotal.WithLabelValues("success").(prometheus.Counter).Write(successMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(3), successMetric.GetCounter().GetValue())
+
+		fallbackMetric := &dto.Metric{}
+		err = nodePricingLookupTotal.WithLabelValues("fallback").(prometheus.Counter).Write(fallbackMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(1), fallbackMetric.GetCounter().GetValue())
+	})
+}
+
+func TestNodePricingLookupMetricDefinition(t *testing.T) {
+	// Test that the metric can be created with the expected configuration
+	// matching the production definition in initCostModelMetrics
+	t.Run("metric created with correct name and labels", func(t *testing.T) {
+		counter := prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "opencost_node_pricing_lookup_total",
+			Help: "opencost_node_pricing_lookup_total Total node pricing lookups by status",
+		}, []string{"status"})
+
+		// Verify we can create counters with expected label values
+		successCounter := counter.WithLabelValues("success")
+		fallbackCounter := counter.WithLabelValues("fallback")
+
+		assert.NotNil(t, successCounter)
+		assert.NotNil(t, fallbackCounter)
+
+		// Verify the counters work correctly
+		successCounter.Inc()
+		fallbackCounter.Inc()
+		fallbackCounter.Inc()
+
+		successMetric := &dto.Metric{}
+		err := successCounter.(prometheus.Counter).Write(successMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(1), successMetric.GetCounter().GetValue())
+
+		fallbackMetric := &dto.Metric{}
+		err = fallbackCounter.(prometheus.Counter).Write(fallbackMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(2), fallbackMetric.GetCounter().GetValue())
+	})
+
+	t.Run("metric supports multiple increments", func(t *testing.T) {
+		counter := prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "opencost_node_pricing_lookup_total_multi",
+			Help: "Test metric",
+		}, []string{"status"})
+
+		// Simulate high-volume scenario
+		for i := 0; i < 100; i++ {
+			counter.WithLabelValues("success").Inc()
+		}
+		for i := 0; i < 25; i++ {
+			counter.WithLabelValues("fallback").Inc()
+		}
+
+		successMetric := &dto.Metric{}
+		err := counter.WithLabelValues("success").(prometheus.Counter).Write(successMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(100), successMetric.GetCounter().GetValue())
+
+		fallbackMetric := &dto.Metric{}
+		err = counter.WithLabelValues("fallback").(prometheus.Counter).Write(fallbackMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(25), fallbackMetric.GetCounter().GetValue())
+	})
+}
+
+func TestRecordNodePricingLookupConcurrency(t *testing.T) {
+	// Save original and restore after test
+	original := nodePricingLookupTotal
+	defer func() { nodePricingLookupTotal = original }()
+
+	t.Run("concurrent calls are safe", func(t *testing.T) {
+		nodePricingLookupTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "test_node_pricing_lookup_concurrent",
+			Help: "Test metric",
+		}, []string{"status"})
+
+		done := make(chan bool)
+		iterations := 100
+
+		// Concurrent success recordings
+		go func() {
+			for i := 0; i < iterations; i++ {
+				RecordNodePricingLookup("success")
+			}
+			done <- true
+		}()
+
+		// Concurrent fallback recordings
+		go func() {
+			for i := 0; i < iterations; i++ {
+				RecordNodePricingLookup("fallback")
+			}
+			done <- true
+		}()
+
+		// Wait for both goroutines
+		<-done
+		<-done
+
+		successMetric := &dto.Metric{}
+		err := nodePricingLookupTotal.WithLabelValues("success").(prometheus.Counter).Write(successMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(iterations), successMetric.GetCounter().GetValue())
+
+		fallbackMetric := &dto.Metric{}
+		err = nodePricingLookupTotal.WithLabelValues("fallback").(prometheus.Counter).Write(fallbackMetric)
+		assert.NoError(t, err)
+		assert.Equal(t, float64(iterations), fallbackMetric.GetCounter().GetValue())
+	})
+}