Kaynağa Gözat

Sync v1.117 (#3374)

Signed-off-by: sneax <paladesh600@gmail.com>
Signed-off-by: Matt Bolt <mbolt35@gmail.com>
Signed-off-by: segfault_bits <149077711+sneaxhuh@users.noreply.github.com>
Signed-off-by: Prateek Arora <1192prateek@gmail.com>
Signed-off-by: Sean Holcomb <seanholcomb@gmail.com>
Signed-off-by: Sparsh <sparsh.raj30@gmail.com>
Signed-off-by: Malthe Poulsen <malthe@grundtvigsvej.dk>
Signed-off-by: malpou <malthe@grundtvigsvej.dk>
Signed-off-by: Niko Kovacevic <nikovacevic@gmail.com>
Co-authored-by: segfault_bits <149077711+sneaxhuh@users.noreply.github.com>
Co-authored-by: Ishaan Mittal <ishaanmittal123@gmail.com>
Co-authored-by: prateek1192 <1192prateek@gmail.com>
Co-authored-by: Sean Holcomb <seanholcomb@gmail.com>
Co-authored-by: Sparsh Raj <49100336+spa-raj@users.noreply.github.com>
Co-authored-by: Malthe Poulsen <30603252+malpou@users.noreply.github.com>
Co-authored-by: Niko Kovacevic <nikovacevic@gmail.com>
Matt Bolt 7 ay önce
ebeveyn
işleme
163b87366b
87 değiştirilmiş dosya ile 7732 ekleme ve 448 silme
  1. 2 0
      .github/workflows/vulnerability-scan.yaml
  2. 5 0
      configs/digitalocean.json
  3. 0 2
      core/go.sum
  4. 20 0
      core/pkg/clustercache/clustercache.go
  5. 167 0
      core/pkg/collections/idnamemap.go
  6. 462 0
      core/pkg/collections/idnamemap_test.go
  7. 3 0
      core/pkg/diagnostics/diagnostics.go
  8. 3 3
      core/pkg/diagnostics/exporter/controller.go
  9. 1 1
      core/pkg/diagnostics/exporter/exporter.go
  10. 4 1
      core/pkg/diagnostics/exporter/source.go
  11. 10 0
      core/pkg/env/core.go
  12. 7 0
      core/pkg/env/env.go
  13. 5 5
      core/pkg/exporter/pathing/bingenpath.go
  14. 6 6
      core/pkg/exporter/pathing/path_test.go
  15. 2 2
      core/pkg/heartbeat/exporter/controller.go
  16. 1 1
      core/pkg/heartbeat/exporter/exporter.go
  17. 1 1
      core/pkg/heartbeat/exporter/heartbeat_test.go
  18. 20 1
      core/pkg/nodestats/nodestats.go
  19. 5 0
      core/pkg/opencost/assetprops.go
  20. 86 4
      core/pkg/protocol/http.go
  21. 109 1
      core/pkg/protocol/http_test.go
  22. 25 0
      core/pkg/storage/bucketstorage_test.go
  23. 21 13
      core/pkg/storage/storage.go
  24. 1 0
      modules/collector-source/go.mod
  25. 2 2
      modules/collector-source/go.sum
  26. 12 10
      modules/collector-source/pkg/collector/config.go
  27. 15 13
      modules/collector-source/pkg/collector/datasource.go
  28. 9 11
      modules/collector-source/pkg/env/collectorenv.go
  29. 38 0
      modules/collector-source/pkg/event/scrape.go
  30. 295 92
      modules/collector-source/pkg/metric/diagnostics.go
  31. 106 154
      modules/collector-source/pkg/metric/diagnostics_test.go
  32. 2 1
      modules/collector-source/pkg/metric/walinator.go
  33. 3 0
      modules/collector-source/pkg/metric/walinator_test.go
  34. 74 0
      modules/collector-source/pkg/scrape/clustercache.go
  35. 18 8
      modules/collector-source/pkg/scrape/dcgm.go
  36. 20 3
      modules/collector-source/pkg/scrape/network.go
  37. 2 0
      modules/collector-source/pkg/scrape/opencost.go
  38. 1 0
      modules/collector-source/pkg/scrape/scraper.go
  39. 23 0
      modules/collector-source/pkg/scrape/statsummary.go
  40. 29 2
      modules/collector-source/pkg/scrape/targetscraper.go
  41. 32 28
      modules/prometheus-source/pkg/prom/metricsquerier.go
  42. 29 0
      pkg/cloud/aws/athenaconfiguration.go
  43. 159 1
      pkg/cloud/aws/athenaconfiguration_test.go
  44. 21 1
      pkg/cloud/aws/athenaintegration.go
  45. 232 0
      pkg/cloud/aws/athenaintegration_test.go
  46. 26 0
      pkg/cloud/aws/athenaquerier.go
  47. 5 0
      pkg/cloud/aws/provider.go
  48. 2 0
      pkg/cloud/config/configurations_test.go
  49. 3 0
      pkg/cloud/config/controller_test.go
  50. 943 0
      pkg/cloud/digitalocean/provider.go
  51. 568 0
      pkg/cloud/digitalocean/provider_test.go
  52. 30 0
      pkg/cloud/digitalocean/testdata/do_pricing.json
  53. 7 1
      pkg/cloud/gcp/provider.go
  54. 12 0
      pkg/cloud/gcp/provider_test.go
  55. 1 0
      pkg/cloud/models/models.go
  56. 32 0
      pkg/cloud/models/models_test.go
  57. 14 0
      pkg/cloud/provider/provider.go
  58. 1 1
      pkg/cloudcost/ingestor.go
  59. 64 0
      pkg/currency/README.md
  60. 99 0
      pkg/currency/cache.go
  61. 170 0
      pkg/currency/cache_test.go
  62. 89 0
      pkg/currency/client.go
  63. 105 0
      pkg/currency/converter.go
  64. 228 0
      pkg/currency/converter_test.go
  65. 60 0
      pkg/currency/types.go
  66. 2 2
      pkg/customcost/ingestor.go
  67. 2 2
      pkg/customcost/querier.go
  68. 16 2
      pkg/env/cloudcost.go
  69. 7 13
      pkg/env/costmodel.go
  70. 30 6
      pkg/metrics/deploymentmetrics.go
  71. 541 0
      pkg/metrics/deploymentmetrics_test.go
  72. 12 4
      pkg/metrics/jobmetrics.go
  73. 91 0
      pkg/metrics/jobmetrics_test.go
  74. 29 12
      pkg/metrics/namespacemetrics.go
  75. 195 0
      pkg/metrics/namespacemetrics_test.go
  76. 71 18
      pkg/metrics/nodemetrics.go
  77. 454 0
      pkg/metrics/nodemetrics_test.go
  78. 10 2
      pkg/metrics/podmetrics.go
  79. 862 0
      pkg/metrics/podmetrics_test.go
  80. 19 4
      pkg/metrics/pvcmetrics.go
  81. 130 0
      pkg/metrics/pvcmetrics_test.go
  82. 30 7
      pkg/metrics/pvmetrics.go
  83. 124 0
      pkg/metrics/pvmetrics_test.go
  84. 10 2
      pkg/metrics/servicemetrics.go
  85. 233 0
      pkg/metrics/servicemetrics_test.go
  86. 15 5
      pkg/metrics/statefulsetmetrics.go
  87. 302 0
      pkg/metrics/statefulsetmetrics_test.go

+ 2 - 0
.github/workflows/vulnerability-scan.yaml

@@ -12,6 +12,8 @@ on:
   push:
     branches:
       - develop
+  merge_group:
+    types: [checks_requested]
 
 jobs:
   scan:

+ 5 - 0
configs/digitalocean.json

@@ -0,0 +1,5 @@
+{
+  "zoneNetworkEgress": "0.00",
+  "regionNetworkEgress": "0.00",
+  "internetNetworkEgress": "0.01"
+}

+ 0 - 2
core/go.sum

@@ -201,8 +201,6 @@ github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PU
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
-github.com/golang-jwt/jwt/v5 v5.2.1 h1:OuVbFODueb089Lh128TAcimifWaLhJwVflnrgM17wHk=
-github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
 github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
 github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=

+ 20 - 0
core/pkg/clustercache/clustercache.go

@@ -15,6 +15,7 @@ import (
 )
 
 type Namespace struct {
+	UID         types.UID
 	Name        string
 	Labels      map[string]string
 	Annotations map[string]string
@@ -51,6 +52,7 @@ type Container struct {
 }
 
 type Node struct {
+	UID            types.UID
 	Name           string
 	Labels         map[string]string
 	Annotations    map[string]string
@@ -59,6 +61,7 @@ type Node struct {
 }
 
 type Service struct {
+	UID          types.UID
 	Name         string
 	Namespace    string
 	SpecSelector map[string]string
@@ -75,6 +78,7 @@ type DaemonSet struct {
 }
 
 type Deployment struct {
+	UID                     types.UID
 	Name                    string
 	Namespace               string
 	Labels                  map[string]string
@@ -88,6 +92,7 @@ type Deployment struct {
 }
 
 type StatefulSet struct {
+	UID          types.UID
 	Name         string
 	Namespace    string
 	Labels       map[string]string
@@ -98,6 +103,7 @@ type StatefulSet struct {
 }
 
 type PersistentVolumeClaim struct {
+	UID         types.UID
 	Name        string
 	Namespace   string
 	Spec        v1.PersistentVolumeClaimSpec
@@ -116,12 +122,14 @@ type StorageClass struct {
 }
 
 type Job struct {
+	UID       types.UID
 	Name      string
 	Namespace string
 	Status    batchv1.JobStatus
 }
 
 type PersistentVolume struct {
+	UID         types.UID
 	Name        string
 	Namespace   string
 	Labels      map[string]string
@@ -144,6 +152,7 @@ type PodDisruptionBudget struct {
 }
 
 type ReplicaSet struct {
+	UID             types.UID
 	Name            string
 	Namespace       string
 	OwnerReferences []metav1.OwnerReference
@@ -181,6 +190,7 @@ func GetControllerOfNoCopy(pod *Pod) *metav1.OwnerReference {
 
 func TransformNamespace(input *v1.Namespace) *Namespace {
 	return &Namespace{
+		UID:         input.UID,
 		Name:        input.Name,
 		Annotations: input.Annotations,
 		Labels:      input.Labels,
@@ -241,6 +251,7 @@ func TransformPod(input *v1.Pod) *Pod {
 
 func TransformNode(input *v1.Node) *Node {
 	return &Node{
+		UID:            input.UID,
 		Name:           input.Name,
 		Labels:         input.Labels,
 		Annotations:    input.Annotations,
@@ -251,6 +262,7 @@ func TransformNode(input *v1.Node) *Node {
 
 func TransformService(input *v1.Service) *Service {
 	return &Service{
+		UID:          input.UID,
 		Name:         input.Name,
 		Namespace:    input.Namespace,
 		SpecSelector: input.Spec.Selector,
@@ -271,6 +283,7 @@ func TransformDaemonSet(input *appsv1.DaemonSet) *DaemonSet {
 
 func TransformDeployment(input *appsv1.Deployment) *Deployment {
 	return &Deployment{
+		UID:                     input.UID,
 		Name:                    input.Name,
 		Namespace:               input.Namespace,
 		Labels:                  input.Labels,
@@ -290,11 +303,15 @@ func TransformStatefulSet(input *appsv1.StatefulSet) *StatefulSet {
 		SpecSelector: input.Spec.Selector,
 		SpecReplicas: input.Spec.Replicas,
 		PodSpec:      TransformPodSpec(input.Spec.Template.Spec),
+		Labels:       input.Labels,
+		Annotations:  input.Annotations,
+		UID:          input.UID,
 	}
 }
 
 func TransformPersistentVolume(input *v1.PersistentVolume) *PersistentVolume {
 	return &PersistentVolume{
+		UID:         input.UID,
 		Name:        input.Name,
 		Namespace:   input.Namespace,
 		Labels:      input.Labels,
@@ -306,6 +323,7 @@ func TransformPersistentVolume(input *v1.PersistentVolume) *PersistentVolume {
 
 func TransformPersistentVolumeClaim(input *v1.PersistentVolumeClaim) *PersistentVolumeClaim {
 	return &PersistentVolumeClaim{
+		UID:         input.UID,
 		Name:        input.Name,
 		Namespace:   input.Namespace,
 		Spec:        input.Spec,
@@ -328,6 +346,7 @@ func TransformStorageClass(input *stv1.StorageClass) *StorageClass {
 
 func TransformJob(input *batchv1.Job) *Job {
 	return &Job{
+		UID:       input.UID,
 		Name:      input.Name,
 		Namespace: input.Namespace,
 		Status:    input.Status,
@@ -353,6 +372,7 @@ func TransformPodDisruptionBudget(input *policyv1.PodDisruptionBudget) *PodDisru
 
 func TransformReplicaSet(input *appsv1.ReplicaSet) *ReplicaSet {
 	return &ReplicaSet{
+		UID:             input.UID,
 		Name:            input.Name,
 		Namespace:       input.Namespace,
 		OwnerReferences: input.OwnerReferences,

+ 167 - 0
core/pkg/collections/idnamemap.go

@@ -0,0 +1,167 @@
+package collections
+
+import (
+	"errors"
+	"fmt"
+	"iter"
+)
+
+var (
+	// ErrEmptyID is returned when the provided entry into an IdNameMap returns an empty string
+	// for ID
+	ErrEmptyID error = errors.New("id must be non-empty")
+
+	// ErrEmptyName is returned when the provided entry into an IdNameMap returns an empty string
+	// for Name
+	ErrEmptyName error = errors.New("name must be non-empty")
+)
+
+// WithIdName is a generic constraint required for elements added to a `IdNameMap`
+type WithIdName interface {
+	Id() string
+	Name() string
+}
+
+// IdNameMap contains two maps which alias the same element by id and name. It provides O(1) lookups
+// by identifier or by name, both a required constraint on the `T` type.
+type IdNameMap[T WithIdName] struct {
+	m map[string]T
+	r map[string]T
+}
+
+func NewIdNameMap[T WithIdName]() *IdNameMap[T] {
+	return &IdNameMap[T]{
+		m: make(map[string]T),
+		r: make(map[string]T),
+	}
+}
+
+// Insert inserts a `T` instance into the map successfully under the following requirements:
+//
+// Insertion of new Entry:
+//  1. IDs and Name for the `T` instance must be non-empty.
+//  2. ID and Name must not partially overlap with an existing entry. This would happen if
+//     you attempted to insert a `T` with a unique ID, but a conflicting Name. Likewise,
+//     a unique name, but conflicting ID will also fail.
+//
+// Replacing an existing Entry:
+//  1. If there exists an old entry with the id of the new entry, then the name for the new
+//     entry must also point to the old entry.
+//  2. If there exists an old entry with the name of the new entry, then the id for the new
+//     entry must also point to the old entry.
+//
+// To summarize, you can replace an existing item as long as the id/name lookups for the entry
+// being replaced are the same.
+func (rm *IdNameMap[T]) Insert(item T) error {
+	id := item.Id()
+	if id == "" {
+		return ErrEmptyID
+	}
+
+	name := item.Name()
+	if name == "" {
+		return ErrEmptyName
+	}
+
+	oldForId, idExists := rm.m[id]
+	oldForName, nameExists := rm.r[name]
+
+	// check partial insertion of id
+	if idExists && !nameExists {
+		return fmt.Errorf(
+			"insertion of new entry: [id: %s, name: %s] would partially overwrite existing entry: [id: %s, name: %s]",
+			id,
+			name,
+			oldForId.Id(),
+			oldForId.Name(),
+		)
+	}
+
+	// check partial insertion of name
+	if !idExists && nameExists {
+		return fmt.Errorf(
+			"insertion of new entry: [id: %s, name: %s] would partially overwrite existing entry: [id: %s, name: %s]",
+			id,
+			name,
+			oldForName.Id(),
+			oldForName.Name(),
+		)
+	}
+
+	// if we are overwriting, check to ensure that the entities from each map have identical mappings
+	if idExists && nameExists {
+		if oldForId.Id() != oldForName.Id() || oldForId.Name() != oldForName.Name() {
+			return fmt.Errorf(
+				"attempting to overwrite entries [id: %s, name: %s] and [id: %s, name: %s] with new entry [id: %s, name: %s] creating a multi-entry conflict",
+				oldForId.Id(),
+				oldForId.Name(),
+				oldForName.Id(),
+				oldForName.Name(),
+				id,
+				name,
+			)
+		}
+	}
+
+	rm.m[id] = item
+	rm.r[name] = item
+
+	return nil
+}
+
+func (rm *IdNameMap[T]) ById(id string) (T, bool) {
+	item, ok := rm.m[id]
+	return item, ok
+}
+
+func (rm *IdNameMap[T]) ByName(name string) (T, bool) {
+	item, ok := rm.r[name]
+	return item, ok
+}
+
+func (rm *IdNameMap[T]) RemoveById(id string) bool {
+	item, ok := rm.ById(id)
+	if !ok {
+		return false
+	}
+
+	name := item.Name()
+	delete(rm.m, id)
+	delete(rm.r, name)
+
+	return true
+}
+
+func (rm *IdNameMap[T]) RemoveByName(name string) bool {
+	item, ok := rm.ByName(name)
+	if !ok {
+		return false
+	}
+
+	id := item.Id()
+	delete(rm.m, id)
+	delete(rm.r, name)
+
+	return true
+}
+
+func (rm *IdNameMap[T]) Keys() iter.Seq2[string, string] {
+	return func(yield func(string, string) bool) {
+		for id, value := range rm.m {
+			name := value.Name()
+			if !yield(id, name) {
+				return
+			}
+		}
+	}
+}
+
+func (rm *IdNameMap[T]) Values() iter.Seq[T] {
+	return func(yield func(T) bool) {
+		for _, value := range rm.m {
+			if !yield(value) {
+				return
+			}
+		}
+	}
+}

+ 462 - 0
core/pkg/collections/idnamemap_test.go

@@ -0,0 +1,462 @@
+package collections
+
+import (
+	"errors"
+	"testing"
+)
+
+type testItem struct {
+	id    string
+	name  string
+	value int
+}
+
+func (t testItem) Id() string {
+	return t.id
+}
+
+func (t testItem) Name() string {
+	return t.name
+}
+
+func TestNewIdNameMap(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	if m == nil {
+		t.Fatal("NewIdNameMap returned nil")
+	}
+	if m.m == nil {
+		t.Fatal("internal id map is nil")
+	}
+	if m.r == nil {
+		t.Fatal("internal name map is nil")
+	}
+}
+
+func TestIdNameMap_Insert(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	item := testItem{id: "id1", name: "name1"}
+
+	m.Insert(item)
+
+	// Verify item can be retrieved by id
+	retrieved, ok := m.ById("id1")
+	if !ok {
+		t.Fatal("item not found by id after insert")
+	}
+	if retrieved.Id() != "id1" || retrieved.Name() != "name1" {
+		t.Errorf("retrieved item mismatch: got %+v, want %+v", retrieved, item)
+	}
+
+	// Verify item can be retrieved by name
+	retrieved, ok = m.ByName("name1")
+	if !ok {
+		t.Fatal("item not found by name after insert")
+	}
+	if retrieved.Id() != "id1" || retrieved.Name() != "name1" {
+		t.Errorf("retrieved item mismatch: got %+v, want %+v", retrieved, item)
+	}
+}
+
+func TestIdNameMap_InsertEmptyIdAndName(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	item1 := testItem{id: "", name: "name1"}
+	item2 := testItem{id: "id1", name: ""}
+
+	err := m.Insert(item1)
+	if err == nil {
+		t.Fatalf("Expected insertion failure, but succeeded!")
+	}
+	if !errors.Is(err, ErrEmptyID) {
+		t.Fatalf("Expected ErrEmptyID, but instead got: %s", err.Error())
+	}
+
+	err = m.Insert(item2)
+	if err == nil {
+		t.Fatalf("Expected insertion failure, but succeeded!")
+	}
+	if !errors.Is(err, ErrEmptyName) {
+		t.Fatalf("Expected ErrEmptyName, but instead got: %s", err.Error())
+	}
+}
+
+func TestIdNameMap_InsertDuplicateId(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	item1 := testItem{id: "id1", name: "name1"}
+	item2 := testItem{id: "id1", name: "name2"}
+	item3 := testItem{id: "id1", name: "name1", value: 10}
+
+	err := m.Insert(item1)
+	if err != nil {
+		t.Fatalf("%s", err.Error())
+		return
+	}
+
+	// should fail
+	err = m.Insert(item2)
+	if err == nil {
+		t.Fatalf("Expected insertion to fail, but it succeeded!")
+	} else {
+		t.Logf("%s", err.Error())
+	}
+
+	err = m.Insert(item3)
+	if err != nil {
+		t.Fatalf("%s", err.Error())
+	}
+
+	// Verify item3 (same id/name pair, new value) overwrote the original entry
+	retrieved, ok := m.ById("id1")
+	if !ok {
+		t.Fatal("item not found by id")
+	}
+	if retrieved.Name() != "name1" {
+		t.Errorf("expected name1, got %s", retrieved.Name())
+	}
+
+	// Failed name should not exist
+	_, ok = m.ByName("name2")
+	if ok {
+		t.Error("old name should not exist after overwrite")
+	}
+
+	// Second name should exist
+	retrieved, ok = m.ByName("name1")
+	if !ok {
+		t.Fatal("new name not found")
+	}
+	if retrieved.Id() != "id1" {
+		t.Errorf("expected id1, got %s", retrieved.Id())
+	}
+	if retrieved.value != 10 {
+		t.Errorf("expected 10, got: %d", retrieved.value)
+	}
+}
+
+func TestIdNameMap_InsertDuplicateName(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	item1 := testItem{id: "id1", name: "name1"}
+	item2 := testItem{id: "id2", name: "name1"}
+	item3 := testItem{id: "id1", name: "name1", value: 10}
+
+	err := m.Insert(item1)
+	if err != nil {
+		t.Fatalf("%s", err.Error())
+		return
+	}
+
+	// expect error
+	err = m.Insert(item2)
+	if err == nil {
+		t.Fatalf("Expected insertion to fail, but it succeeded!")
+	} else {
+		t.Log(err)
+	}
+
+	// overwrite
+	err = m.Insert(item3)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify item3 (same id/name pair, new value) overwrote the original entry by name
+	retrieved, ok := m.ByName("name1")
+	if !ok {
+		t.Fatal("item not found by name")
+	}
+	if retrieved.Id() != "id1" {
+		t.Errorf("expected id1, got %s", retrieved.Id())
+	}
+
+	// id2 shouldn't exist
+	_, ok = m.ById("id2")
+	if ok {
+		t.Error("id2 failed to insert, so should not exist")
+	}
+
+	retrieved, ok = m.ById("id1")
+	if !ok {
+		t.Fatal("id1 not found")
+	}
+	if retrieved.Name() != "name1" {
+		t.Errorf("expected name1, got %s", retrieved.Name())
+	}
+	if retrieved.value != 10 {
+		t.Errorf("expected value: 10, got %d", retrieved.value)
+	}
+}
+
+func TestIdNameMap_InsertTrickyPartialOverwrites(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	// 2 unique items
+	item1 := testItem{id: "id1", name: "foo"}
+	item2 := testItem{id: "id2", name: "bar"}
+
+	// item overlaps id from previous entry, and name from another previous entry
+	item3 := testItem{id: "id1", name: "bar"}
+
+	err := m.Insert(item1)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = m.Insert(item2)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = m.Insert(item3)
+	if err == nil {
+		t.Fatalf("Expected to fail, but insert succeeded!")
+	} else {
+		t.Log(err)
+	}
+
+	if !m.RemoveById("id1") {
+		t.Fatalf("Expected to remove entry with id: 'id1', but failed")
+	}
+
+	// this will _still_ be a partial insert, so expect failure
+	err = m.Insert(item3)
+	if err == nil {
+		t.Fatalf("Expected to fail, but insert succeeded!")
+	} else {
+		t.Log(err)
+	}
+
+	if !m.RemoveByName("bar") {
+		t.Fatalf("Expected to remove entry with name: 'bar', but failed")
+	}
+
+	// this should now succeed!
+	err = m.Insert(item3)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestIdNameMap_ByIdNotFound(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	_, ok := m.ById("nonexistent")
+	if ok {
+		t.Error("expected false for nonexistent id")
+	}
+}
+
+func TestIdNameMap_ByNameNotFound(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	_, ok := m.ByName("nonexistent")
+	if ok {
+		t.Error("expected false for nonexistent name")
+	}
+}
+
+func TestIdNameMap_RemoveById(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	item := testItem{id: "id1", name: "name1"}
+
+	m.Insert(item)
+
+	// Verify item exists
+	_, ok := m.ById("id1")
+	if !ok {
+		t.Fatal("item should exist before removal")
+	}
+	_, ok = m.ByName("name1")
+	if !ok {
+		t.Fatal("item should exist before removal")
+	}
+
+	// Remove by id
+	removed := m.RemoveById("id1")
+	if !removed {
+		t.Error("RemoveById should return true for existing item")
+	}
+
+	// Verify item no longer exists
+	_, ok = m.ById("id1")
+	if ok {
+		t.Error("item should not exist after removal by id")
+	}
+	_, ok = m.ByName("name1")
+	if ok {
+		t.Error("item should not exist after removal by id")
+	}
+}
+
+func TestIdNameMap_RemoveByIdNotFound(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	removed := m.RemoveById("nonexistent")
+	if removed {
+		t.Error("RemoveById should return false for nonexistent item")
+	}
+}
+
+func TestIdNameMap_RemoveByName(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	item := testItem{id: "id1", name: "name1"}
+
+	m.Insert(item)
+
+	// Verify item exists
+	_, ok := m.ById("id1")
+	if !ok {
+		t.Fatal("item should exist before removal")
+	}
+	_, ok = m.ByName("name1")
+	if !ok {
+		t.Fatal("item should exist before removal")
+	}
+
+	// Remove by name
+	removed := m.RemoveByName("name1")
+	if !removed {
+		t.Error("RemoveByName should return true for existing item")
+	}
+
+	// Verify item no longer exists
+	_, ok = m.ById("id1")
+	if ok {
+		t.Error("item should not exist after removal by name")
+	}
+	_, ok = m.ByName("name1")
+	if ok {
+		t.Error("item should not exist after removal by name")
+	}
+}
+
+func TestIdNameMap_RemoveByNameNotFound(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	removed := m.RemoveByName("nonexistent")
+	if removed {
+		t.Error("RemoveByName should return false for nonexistent item")
+	}
+}
+
+func TestIdNameMap_Keys(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	items := []testItem{
+		{id: "id1", name: "name1"},
+		{id: "id2", name: "name2"},
+		{id: "id3", name: "name3"},
+	}
+
+	for _, item := range items {
+		m.Insert(item)
+	}
+
+	keys := make(map[string]string)
+	for id, name := range m.Keys() {
+		keys[id] = name
+	}
+
+	if len(keys) != 3 {
+		t.Errorf("expected 3 keys, got %d", len(keys))
+	}
+
+	for _, item := range items {
+		name, ok := keys[item.id]
+		if !ok {
+			t.Errorf("missing key for id %s", item.id)
+		}
+		if name != item.name {
+			t.Errorf("wrong name for id %s: got %s, want %s", item.id, name, item.name)
+		}
+	}
+}
+
+func TestIdNameMap_Values(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	items := []testItem{
+		{id: "id1", name: "name1"},
+		{id: "id2", name: "name2"},
+		{id: "id3", name: "name3"},
+	}
+
+	for _, item := range items {
+		m.Insert(item)
+	}
+
+	values := make(map[string]testItem)
+	for value := range m.Values() {
+		values[value.Id()] = value
+	}
+
+	if len(values) != 3 {
+		t.Errorf("expected 3 values, got %d", len(values))
+	}
+
+	for _, item := range items {
+		value, ok := values[item.id]
+		if !ok {
+			t.Errorf("missing value for id %s", item.id)
+		}
+		if value.Id() != item.id || value.Name() != item.name {
+			t.Errorf("wrong value for id %s: got %+v, want %+v", item.id, value, item)
+		}
+	}
+}
+
+func TestIdNameMap_EmptyIterators(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+
+	// Test empty Keys iterator
+	count := 0
+	for range m.Keys() {
+		count++
+	}
+	if count != 0 {
+		t.Errorf("expected 0 keys in empty map, got %d", count)
+	}
+
+	// Test empty Values iterator
+	count = 0
+	for range m.Values() {
+		count++
+	}
+	if count != 0 {
+		t.Errorf("expected 0 values in empty map, got %d", count)
+	}
+}
+
+func TestIdNameMap_IteratorEarlyBreak(t *testing.T) {
+	m := NewIdNameMap[testItem]()
+	items := []testItem{
+		{id: "id1", name: "name1"},
+		{id: "id2", name: "name2"},
+		{id: "id3", name: "name3"},
+	}
+
+	for _, item := range items {
+		m.Insert(item)
+	}
+
+	// Test early break in Keys iterator
+	count := 0
+	for range m.Keys() {
+		count++
+		if count == 1 {
+			break
+		}
+	}
+	if count != 1 {
+		t.Errorf("expected early break to work in Keys iterator, got count %d", count)
+	}
+
+	// Test early break in Values iterator
+	count = 0
+	for range m.Values() {
+		count++
+		if count == 1 {
+			break
+		}
+	}
+	if count != 1 {
+		t.Errorf("expected early break to work in Values iterator, got count %d", count)
+	}
+}

+ 3 - 0
core/pkg/diagnostics/diagnostics.go

@@ -37,6 +37,9 @@ type DiagnosticResult struct {
 
 // DiagnosticsRunReport is a struct that contains the start time of the diagnostics run, and all of the results.
 type DiagnosticsRunReport struct {
+	// Application contains the name of the application that the diagnostics run belongs to.
+	Application string `json:"application"`
+
 	// StartTime contains the time when the full diagnostics run started
 	StartTime time.Time `json:"startTime"`
 

+ 3 - 3
core/pkg/diagnostics/exporter/controller.go

@@ -8,13 +8,13 @@ import (
 
 // NewDiagnosticsExportController creates a new EventExportController for DiagnosticsRunReport events.
 func NewDiagnosticsExportController(
-	clusterId string,
 	applicationName string,
+	clusterId string,
 	store storage.Storage,
 	service diagnostics.DiagnosticService,
 ) *exporter.EventExportController[diagnostics.DiagnosticsRunReport] {
 	return exporter.NewEventExportController(
-		NewDiagnosticSource(service),
-		NewDiagnosticExporter(clusterId, applicationName, store),
+		NewDiagnosticSource(applicationName, service),
+		NewDiagnosticExporter(applicationName, clusterId, store),
 	)
 }

+ 1 - 1
core/pkg/diagnostics/exporter/exporter.go

@@ -9,7 +9,7 @@ import (
 )
 
 // NewDiagnosticExporter creates a new `StorageExporter[DiagnosticsRunReport]` instance for exporting diagnostic run events.
-func NewDiagnosticExporter(clusterId string, applicationName string, storage storage.Storage) exporter.EventExporter[diagnostics.DiagnosticsRunReport] {
+func NewDiagnosticExporter(applicationName string, clusterId string, storage storage.Storage) exporter.EventExporter[diagnostics.DiagnosticsRunReport] {
 	pathing, err := pathing.NewEventStoragePathFormatter(applicationName, clusterId, diagnostics.DiagnosticsEventName)
 	if err != nil {
 		log.Errorf("failed to create pathing formatter: %v", err)

+ 4 - 1
core/pkg/diagnostics/exporter/source.go

@@ -9,13 +9,15 @@ import (
 
 // DiagnosticSource is an `export.ExportSource` implementation that provides the basic data for a `DiagnosticResult` payload.
 type DiagnosticSource struct {
+	applicationName string
 	diagnosticService diagnostics.DiagnosticService
 }
 
 // NewDiagnosticSource creates a new `DiagnosticSource` instance. It accepts the `DiagnosticService` implementation
 // that will be used to retrieve the diagnostic results.
-func NewDiagnosticSource(diagnosticService diagnostics.DiagnosticService) *DiagnosticSource {
+func NewDiagnosticSource(applicationName string, diagnosticService diagnostics.DiagnosticService) *DiagnosticSource {
 	return &DiagnosticSource{
+		applicationName: applicationName,
 		diagnosticService: diagnosticService,
 	}
 }
@@ -31,6 +33,7 @@ func (ds *DiagnosticSource) Make(t time.Time) *diagnostics.DiagnosticsRunReport
 
 	return &diagnostics.DiagnosticsRunReport{
 		StartTime: t,
+		Application: ds.applicationName,
 		Results:   ds.diagnosticService.Run(ctx),
 	}
 }

+ 10 - 0
core/pkg/env/core.go

@@ -10,11 +10,16 @@ const DefaultStorageFile = "federated-store.yaml"
 const (
 	APIPortEnvVar    = "API_PORT"
 	ClusterIDEnvVar  = "CLUSTER_ID"
+	AppNameEnvVar    = "APP_NAME"
 	ConfigPathEnvVar = "CONFIG_PATH"
 
 	PProfEnabledEnvVar = "PPROF_ENABLED"
 
 	InstallNamespaceEnvVar = "INSTALL_NAMESPACE"
+
+	Resolution1dRetentionEnvVar  = "RESOLUTION_1D_RETENTION"  // int: number of days
+	Resolution1hRetentionEnvVar  = "RESOLUTION_1H_RETENTION"  // int: number of hours
+	Resolution10mRetentionEnvVar = "RESOLUTION_10M_RETENTION" // int: number of 10m segments
 )
 
 // GetAPIPort returns the environment variable value for APIPortEnvVar which
@@ -29,6 +34,11 @@ func GetClusterID() string {
 	return Get(ClusterIDEnvVar, "")
 }
 
+// GetAppName returns the name of the application, defaulting to "Opencost" when APP_NAME is unset
+func GetAppName() string {
+	return Get(AppNameEnvVar, "Opencost")
+}
+
 // GetConfigPath returns the environment variable value for ConfigPathEnvVar which represents the cost
 // model configuration path
 func GetConfigPath() string {

+ 7 - 0
core/pkg/env/env.go

@@ -198,3 +198,10 @@ func SetBool(key string, value bool) error {
 func SetDuration(key string, value time.Duration) error {
 	return envMapper.SetDuration(key, value)
 }
+
+// GetPrefixInt parses an int from the environment variable key parameter. It first checks the env var with the prefix,
+// then checks the env var without the prefix. If the environment variable is empty or fails to parse, the defaultValue
+// parameter is returned.
+func GetPrefixInt(prefix, key string, defaultValue int) int {
+	return envMapper.GetInt(prefix+key, envMapper.GetInt(key, defaultValue))
+}

+ 5 - 5
core/pkg/exporter/pathing/bingenpath.go

@@ -11,8 +11,8 @@ import (
 )
 
 const (
-	defaultRootDir string = "federated"
-	baseStorageDir string = "etl/bingen"
+	DefaultRootDir string = "federated"
+	BaseStorageDir string = "etl/bingen"
 )
 
 // BingenStoragePathFormatter is an implementation of the StoragePathFormatter interface for
@@ -27,7 +27,7 @@ type BingenStoragePathFormatter struct {
 }
 
 func NewDefaultStoragePathFormatter(clusterId, pipeline string, resolution *time.Duration) (StoragePathFormatter[opencost.Window], error) {
-	return NewBingenStoragePathFormatter(defaultRootDir, clusterId, pipeline, resolution)
+	return NewBingenStoragePathFormatter(DefaultRootDir, clusterId, pipeline, resolution)
 }
 
 // NewBingenStoragePathFormatter creates a StoragePathFormatter for a cluster separated storage path
@@ -65,7 +65,7 @@ func (bsf *BingenStoragePathFormatter) Dir() string {
 	return path.Join(
 		bsf.rootDir,
 		bsf.clusterId,
-		baseStorageDir,
+		BaseStorageDir,
 		bsf.pipeline,
 		bsf.resolution,
 	)
@@ -80,7 +80,7 @@ func (bsf *BingenStoragePathFormatter) ToFullPath(prefix string, window opencost
 	return path.Join(
 		bsf.rootDir,
 		bsf.clusterId,
-		baseStorageDir,
+		BaseStorageDir,
 		bsf.pipeline,
 		bsf.resolution,
 		fileName,

+ 6 - 6
core/pkg/exporter/pathing/path_test.go

@@ -25,7 +25,7 @@ func TestBingenPathFormatter(t *testing.T) {
 			pipeline:   "allocation",
 			resolution: nil,
 			prefix:     "",
-			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1704110400-1704114000", defaultRootDir, baseStorageDir),
+			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1704110400-1704114000", DefaultRootDir, BaseStorageDir),
 		},
 		{
 			name:       "with resolution",
@@ -33,7 +33,7 @@ func TestBingenPathFormatter(t *testing.T) {
 			pipeline:   "allocation",
 			resolution: &[]time.Duration{1 * time.Hour}[0],
 			prefix:     "",
-			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1h/1704110400-1704114000", defaultRootDir, baseStorageDir),
+			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1h/1704110400-1704114000", DefaultRootDir, BaseStorageDir),
 		},
 		{
 			name:       "no resolution with prefix",
@@ -41,7 +41,7 @@ func TestBingenPathFormatter(t *testing.T) {
 			pipeline:   "allocation",
 			resolution: nil,
 			prefix:     "test",
-			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/test.1704110400-1704114000", defaultRootDir, baseStorageDir),
+			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/test.1704110400-1704114000", DefaultRootDir, BaseStorageDir),
 		},
 		{
 			name:       "with resolution with prefix",
@@ -49,7 +49,7 @@ func TestBingenPathFormatter(t *testing.T) {
 			pipeline:   "allocation",
 			resolution: &[]time.Duration{1 * time.Hour}[0],
 			prefix:     "test",
-			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1h/test.1704110400-1704114000", defaultRootDir, baseStorageDir),
+			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1h/test.1704110400-1704114000", DefaultRootDir, BaseStorageDir),
 		},
 		{
 			name:       "daily resolution",
@@ -57,7 +57,7 @@ func TestBingenPathFormatter(t *testing.T) {
 			pipeline:   "allocation",
 			resolution: &[]time.Duration{24 * time.Hour}[0],
 			prefix:     "",
-			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1d/1704110400-1704196800", defaultRootDir, baseStorageDir),
+			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1d/1704110400-1704196800", DefaultRootDir, BaseStorageDir),
 		},
 		{
 			name:       "weekly resolution",
@@ -65,7 +65,7 @@ func TestBingenPathFormatter(t *testing.T) {
 			pipeline:   "allocation",
 			resolution: &[]time.Duration{7 * 24 * time.Hour}[0],
 			prefix:     "",
-			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1w/1704110400-1704715200", defaultRootDir, baseStorageDir),
+			expected:   fmt.Sprintf("%s/cluster-a/%s/allocation/1w/1704110400-1704715200", DefaultRootDir, BaseStorageDir),
 		},
 	}
 

+ 2 - 2
core/pkg/heartbeat/exporter/controller.go

@@ -9,14 +9,14 @@ import (
 // NewHeartbeatExportController creates a new EventExportController for Heartbeat events.
 // A HeartbeatMetadataProvider can optionally be provided to append metadata to the Heartbeat payload.
 func NewHeartbeatExportController(
-	clusterId string,
 	applicationName string,
+	clusterId string,
 	version string,
 	store storage.Storage,
 	provider HeartbeatMetadataProvider,
 ) *exporter.EventExportController[heartbeat.Heartbeat] {
 	return exporter.NewEventExportController(
 		NewHeartbeatSource(applicationName, version, provider),
-		NewHeartbeatExporter(clusterId, applicationName, store),
+		NewHeartbeatExporter(applicationName, clusterId, store),
 	)
 }

+ 1 - 1
core/pkg/heartbeat/exporter/exporter.go

@@ -9,7 +9,7 @@ import (
 )
 
 // NewHeartbeatExporter creates a new `StorageExporter[Heartbeat]` instance for exporting Heartbeat events.
-func NewHeartbeatExporter(clusterId string, applicationName string, storage storage.Storage) exporter.EventExporter[heartbeat.Heartbeat] {
+func NewHeartbeatExporter(applicationName string, clusterId string, storage storage.Storage) exporter.EventExporter[heartbeat.Heartbeat] {
 	pathing, err := pathing.NewEventStoragePathFormatter(applicationName, clusterId, heartbeat.HeartbeatEventName)
 	if err != nil {
 		log.Errorf("failed to create pathing formatter: %v", err)

+ 1 - 1
core/pkg/heartbeat/exporter/heartbeat_test.go

@@ -38,7 +38,7 @@ func TestHeartbeatExporter(t *testing.T) {
 	mdp := NewMockHeartbeatMetadataProvider()
 	store := storage.NewMemoryStorage()
 
-	controller := NewHeartbeatExportController(MockClusterId, MockApplicationName, MockVersion, store, mdp)
+	controller := NewHeartbeatExportController(MockApplicationName, MockClusterId, MockVersion, store, mdp)
 
 	if !controller.Start(time.Second) {
 		t.Fatal("Failed to start controller")

+ 20 - 1
core/pkg/nodestats/nodestats.go

@@ -7,6 +7,7 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"sync"
 
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/core/pkg/log"
@@ -60,6 +61,9 @@ func (nssc *NodeStatsSummaryClient) GetNodeData() ([]*stats.Summary, error) {
 	size := nssc.config.ConcurrentPollers
 	nodes := getReadyNodes(nssc.cache)
 
+	var errLock sync.Mutex
+	var errs []error
+
 	work := func(n *clustercache.Node) *stats.Summary {
 		if n.SpecProviderID == "" {
 			log.Warnf("node ProviderID not set, skipping for %s", n.Name)
@@ -70,12 +74,20 @@ func (nssc *NodeStatsSummaryClient) GetNodeData() ([]*stats.Summary, error) {
 
 		resp, err := requestNodeData(connections, nssc.endpoint, bearerToken)
 		if err != nil {
+			errLock.Lock()
+			errs = append(errs, err)
+			errLock.Unlock()
+
 			log.Warnf("error retrieving node data: %s", err)
 			return nil
 		}
 
 		data, err := nodeResponseToStatSummary(resp)
 		if err != nil {
+			errLock.Lock()
+			errs = append(errs, err)
+			errLock.Unlock()
+
 			log.Warnf("error converting node data: %s", err)
 			return nil
 		}
@@ -83,7 +95,14 @@ func (nssc *NodeStatsSummaryClient) GetNodeData() ([]*stats.Summary, error) {
 		return data
 	}
 
-	return worker.ConcurrentCollectWith(size, work, nodes), nil
+	results := worker.ConcurrentCollectWith(size, work, nodes)
+
+	// no need to lock, as the concurrent collect blocks until all complete
+	var err error = nil
+	if len(errs) > 0 {
+		err = errors.Join(errs...)
+	}
+	return results, err
 }
 
 // connectionOptions returns the connection methods that are allowed for this node based on config

+ 5 - 0
core/pkg/opencost/assetprops.go

@@ -190,6 +190,9 @@ const OracleProvider = "Oracle"
 // OTCProvider describes the provider OTC
 const OTCProvider = "OTC"
 
+// DigitalOceanProvider describes the provider DigitalOcean
+const DigitalOceanProvider = "DigitalOcean"
+
 // NilProvider describes unknown provider
 const NilProvider = "-"
 
@@ -210,6 +213,8 @@ func ParseProvider(str string) string {
 		return ScalewayProvider
 	case "oci", "oracle":
 		return OracleProvider
+	case "digitalocean", "doks", "do":
+		return DigitalOceanProvider
 	default:
 		return NilProvider
 	}

+ 86 - 4
core/pkg/protocol/http.go

@@ -64,6 +64,7 @@ func (hp HTTPProtocol) NotImplemented(message string) HTTPError {
 		Body:       message,
 	}
 }
+
 func (hp HTTPProtocol) Forbidden(message string) HTTPError {
 	if message == "" {
 		message = "Forbidden"
@@ -84,10 +85,11 @@ func (hp HTTPProtocol) NotFound() HTTPError {
 
 // HTTPResponse represents a data envelope for our HTTP messaging
 type HTTPResponse struct {
-	Code    int         `json:"code"`
-	Data    interface{} `json:"data"`
-	Message string      `json:"message,omitempty"`
-	Warning string      `json:"warning,omitempty"`
+	Code    int                    `json:"code"`
+	Data    interface{}            `json:"data"`
+	Meta    map[string]interface{} `json:"meta,omitempty"`
+	Message string                 `json:"message,omitempty"`
+	Warning string                 `json:"warning,omitempty"`
 }
 
 // ToResponse accepts a data payload and/or error to encode into a new HTTPResponse instance. Responses
@@ -106,6 +108,7 @@ func (hp HTTPProtocol) ToResponse(data interface{}, err error) *HTTPResponse {
 		Data: data,
 	}
 }
+
 func (hp HTTPProtocol) WriteRawOK(w http.ResponseWriter) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Content-Length", "0")
@@ -267,3 +270,82 @@ func (hp HTTPProtocol) WriteResponse(w http.ResponseWriter, r *HTTPResponse) {
 		w.Write([]byte(internalServerErrorJSON))
 	}
 }
+
+func (hp HTTPProtocol) NewError(err error, statusCode ...int) *HTTPError {
+	code := http.StatusInternalServerError
+	if len(statusCode) > 0 {
+		code = statusCode[0]
+	}
+
+	var body string
+	if err != nil {
+		body = err.Error()
+	} else {
+		body = "Internal Server Error"
+	}
+
+	return &HTTPError{
+		StatusCode: code,
+		Body:       body,
+	}
+}
+
+func (hp HTTPProtocol) NewResponse(code ...int) *HTTPResponse {
+	r := &HTTPResponse{Code: http.StatusOK}
+
+	if len(code) == 1 {
+		r.Code = code[0]
+	}
+
+	return r
+}
+
+func (r *HTTPResponse) WithCode(code int) *HTTPResponse {
+	if r == nil {
+		r = &HTTPResponse{}
+	}
+
+	r.Code = code
+
+	return r
+}
+
+func (r *HTTPResponse) WithData(data interface{}) *HTTPResponse {
+	if r == nil {
+		r = &HTTPResponse{}
+	}
+
+	r.Data = data
+
+	return r
+}
+
+func (r *HTTPResponse) WithMeta(meta map[string]interface{}) *HTTPResponse {
+	if r == nil {
+		r = &HTTPResponse{}
+	}
+
+	r.Meta = meta
+
+	return r
+}
+
+func (r *HTTPResponse) WithMessage(message string) *HTTPResponse {
+	if r == nil {
+		r = &HTTPResponse{}
+	}
+
+	r.Message = message
+
+	return r
+}
+
+func (r *HTTPResponse) WithWarning(warning string) *HTTPResponse {
+	if r == nil {
+		r = &HTTPResponse{}
+	}
+
+	r.Warning = warning
+
+	return r
+}

+ 109 - 1
core/pkg/protocol/http_test.go

@@ -6,6 +6,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 )
@@ -184,4 +185,111 @@ func TestHTTPProtocol_WriteData_Structure(t *testing.T) {
 	assert.NotContains(t, body, "message")
 	assert.NotContains(t, body, "warning")
 }
- 
+
+func TestHTTPProtocol_HTTPResponse(t *testing.T) {
+	proto := HTTP()
+
+	// Test data, meta, warning, message
+	data := struct {
+		Apples  int
+		Bananas int
+	}{
+		Apples:  12,
+		Bananas: 4,
+	}
+
+	meta := map[string]interface{}{
+		"lastUpdated": time.Date(2025, time.September, 5, 13, 27, 3, 0, time.UTC),
+	}
+
+	warning := "warning"
+
+	message := "message"
+
+	// Test building an HTTPResponse
+
+	var r *HTTPResponse
+
+	r = proto.NewResponse()
+	if r == nil || r.Code != 200 {
+		t.Errorf("expected code %d, received %d", 200, r.Code)
+	}
+
+	r = proto.NewResponse(302).WithMessage("Moved Temporarily")
+	if r == nil || r.Code != 302 || r.Message != "Moved Temporarily" {
+		t.Errorf("expected %d %s, received %d %s", 302, "Moved Temporarily", r.Code, r.Message)
+	}
+
+	r = r.WithCode(200).WithData(data).WithMeta(meta).WithMessage(message).WithWarning(warning)
+	if r == nil {
+		t.Errorf("unexpected nil response")
+	}
+	if r.Code != 200 {
+		t.Errorf("expected code %d, received %d", 200, r.Code)
+	}
+	if r.Data == nil {
+		t.Error("unexpected nil data")
+	}
+	if r.Meta == nil {
+		t.Error("unexpected nil meta")
+	}
+	if r.Message == "" {
+		t.Error("unexpected empty message")
+	}
+	if r.Warning == "" {
+		t.Error("unexpected empty warning")
+	}
+
+	// Test assigning attribtues to nil response
+
+	r = nil
+	r = r.WithCode(413)
+	if r == nil || r.Code != 413 {
+		t.Errorf("expected code %d, received %d", 413, r.Code)
+	}
+
+	r = nil
+	r = r.WithData(data)
+	if r == nil || r.Data == nil {
+		t.Error("expected data, received nil")
+	}
+
+	r = nil
+	r = r.WithMeta(meta)
+	if r == nil || r.Meta == nil {
+		t.Error("expected meta, received nil")
+	}
+
+	r = nil
+	r = r.WithWarning(warning)
+	if r == nil || r.Warning == "" {
+		t.Error("expected warning, received empty string")
+	}
+
+	r = nil
+	r = r.WithMessage(message)
+	if r == nil || r.Message == "" {
+		t.Error("expected message, received empty string")
+	}
+}
+
+func TestHTTPProtocol_NewError(t *testing.T) {
+	proto := HTTP()
+
+	err := errors.New("error")
+
+	httpErr := proto.NewError(err)
+	if httpErr == nil || httpErr.StatusCode != 500 || httpErr.Body != "error" {
+		t.Errorf("expected 500 error, received %d %s", httpErr.StatusCode, httpErr.Body)
+	}
+
+	httpErr = proto.NewError(err, 400)
+	if httpErr == nil || httpErr.StatusCode != 400 || httpErr.Body != "error" {
+		t.Errorf("expected 400 error, received %d %s", httpErr.StatusCode, httpErr.Body)
+	}
+
+	httpErr = proto.NewError(err, 400, 404)
+	if httpErr == nil || httpErr.StatusCode != 400 || httpErr.Body != "error" {
+		t.Errorf("expected 400 error, received %d %s", httpErr.StatusCode, httpErr.Body)
+	}
+}

+ 25 - 0
core/pkg/storage/bucketstorage_test.go

@@ -92,3 +92,28 @@ func TestBucketStorage_Stat(t *testing.T) {
 
 	TestStorageStat(t, store)
 }
+
+// We should be able to call validate function with and without write and delete check without any errors
+func TestBucketStorage_Validate(t *testing.T) {
+	configPath := os.Getenv("TEST_BUCKET_CONFIG")
+	if configPath == "" {
+		t.Skip("skipping integration test, set environment variable TEST_BUCKET_CONFIG")
+	}
+	store, err := createStorage(configPath)
+	if err != nil {
+		t.Errorf("failed to create storage: %s", err.Error())
+		return
+	}
+
+	// Validate BucketStorage with write and delete check
+	err = Validate(store, true)
+	if err != nil {
+		t.Errorf("failed to validate storage: %s", err.Error())
+	}
+
+	// Validate BucketStorage without write and delete check (should not fail)
+	err = Validate(store, false)
+	if err != nil {
+		t.Errorf("failed to validate storage: %s", err.Error())
+	}
+}

+ 21 - 13
core/pkg/storage/storage.go

@@ -2,6 +2,7 @@ package storage
 
 import (
 	"os"
+	"strings"
 	"time"
 
 	"github.com/opencost/opencost/core/pkg/log"
@@ -60,7 +61,7 @@ type Storage interface {
 }
 
 // Validate uses the provided storage implementation to write a test file to the store, followed by a removal.
-func Validate(storage Storage) error {
+func Validate(storage Storage, validateWriteDelete bool) error {
 	const testPath = "tmp/test.txt"
 	const testContent = "test"
 
@@ -78,25 +79,32 @@ func Validate(storage Storage) error {
 		return errors.Wrap(err, "Failed to list path")
 	}
 
-	// attempt to write a path
-	err = storage.Write(testPath, []byte(testContent))
-	if err != nil {
-		return errors.Wrap(err, "Failed to write data to storage")
+	if validateWriteDelete {
+		// attempt to write a path
+		err = storage.Write(testPath, []byte(testContent))
+		if err != nil {
+			return errors.Wrap(err, "Failed to write data to storage")
+		}
 	}
 
 	// attempt to read the path
+	// If we are not validating write and delete, the file won't exist since we never wrote it.
+	// We only want to check read permissions, so ignore errors with "exist" and "404" in the error message to bypass the file not exist error.
 	data, err := storage.Read(testPath)
-	if err != nil {
+	if err != nil && !strings.Contains(err.Error(), "exist") && !strings.Contains(err.Error(), "404") {
 		return errors.Wrap(err, "Failed to read data from storage")
 	}
-	if string(data) != testContent {
-		return errors.New("Failed to read the expected data from storage")
-	}
 
-	// delete the path
-	err = storage.Remove(testPath)
-	if err != nil {
-		return errors.Wrap(err, "Failed to remove data from storage")
+	if validateWriteDelete {
+		if string(data) != testContent {
+			return errors.New("Failed to read the expected data from storage")
+		}
+
+		// delete the path
+		err = storage.Remove(testPath)
+		if err != nil {
+			return errors.Wrap(err, "Failed to remove data from storage")
+		}
 	}
 
 	return nil

+ 1 - 0
modules/collector-source/go.mod

@@ -6,6 +6,7 @@ go 1.24.2
 
 require (
 	github.com/julienschmidt/httprouter v1.3.0
+	github.com/kubecost/events v0.0.8
 	github.com/opencost/opencost/core v0.0.0-20250521155634-81d2b597d1bc
 	golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0
 	k8s.io/api v0.33.1

+ 2 - 2
modules/collector-source/go.sum

@@ -191,8 +191,6 @@ github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PU
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
-github.com/golang-jwt/jwt/v5 v5.2.1 h1:OuVbFODueb089Lh128TAcimifWaLhJwVflnrgM17wHk=
-github.com/golang-jwt/jwt/v5 v5.2.1/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
 github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
 github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
@@ -331,6 +329,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/kubecost/events v0.0.8 h1:FEglMSOGkjiSZT2FnSYM99s2M4DMiBOgHVheM7Vnurs=
+github.com/kubecost/events v0.0.8/go.mod h1:PXnE7CSZs3OulOLcB8baQENploBp4NM7ERZVBCqNi4A=
 github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
 github.com/magiconair/properties v1.8.5 h1:b6kJs+EmPFMYGkow9GiUyCyOvIwYetYJ3fSaWak/Gls=

+ 12 - 10
modules/collector-source/pkg/collector/config.go

@@ -7,10 +7,11 @@ import (
 )
 
 type CollectorConfig struct {
-	Resolutions    []util.ResolutionConfiguration `json:"resolutions"`
-	ScrapeInterval string                         `json:"scrape_interval"`
-	ClusterID      string                         `json:"cluster_id"`
-	NetworkPort    int                            `json:"network_port"`
+	Resolutions     []util.ResolutionConfiguration `json:"resolutions"`
+	ScrapeInterval  string                         `json:"scrape_interval"`
+	ClusterID       string                         `json:"cluster_id"`
+	ApplicationName string                         `json:"application_name"`
+	NetworkPort     int                            `json:"network_port"`
 }
 
 func NewOpenCostCollectorConfigFromEnv() CollectorConfig {
@@ -18,19 +19,20 @@ func NewOpenCostCollectorConfigFromEnv() CollectorConfig {
 		Resolutions: []util.ResolutionConfiguration{
 			{
 				Interval:  "10m",
-				Retention: env.GetCollector10mResolutionRetention(),
+				Retention: env.GetCollectorResolution10mRetention(),
 			},
 			{
 				Interval:  "1h",
-				Retention: env.GetCollector1hResolutionRetention(),
+				Retention: env.GetCollectorResolution1hRetention(),
 			},
 			{
 				Interval:  "1d",
-				Retention: env.GetCollection1dResolutionRetention(),
+				Retention: env.GetCollectionResolution1dRetention(),
 			},
 		},
-		ScrapeInterval: env.GetCollectorScrapeIntervalSeconds(),
-		ClusterID:      coreenv.GetClusterID(),
-		NetworkPort:    env.GetNetworkPort(),
+		ScrapeInterval:  env.GetCollectorScrapeIntervalSeconds(),
+		ClusterID:       coreenv.GetClusterID(),
+		ApplicationName: coreenv.GetAppName(),
+		NetworkPort:     env.GetNetworkPort(),
 	}
 }

+ 15 - 13
modules/collector-source/pkg/collector/datasource.go

@@ -18,11 +18,11 @@ import (
 )
 
 type collectorDataSource struct {
-	metricsQuerier *collectorMetricsQuerier
-	clusterMap     clusters.ClusterMap
-	clusterInfo    clusters.ClusterInfoProvider
-	config         CollectorConfig
-	diagnosticsModule    *metric.DiagnosticsModule
+	metricsQuerier    *collectorMetricsQuerier
+	clusterMap        clusters.ClusterMap
+	clusterInfo       clusters.ClusterInfoProvider
+	config            CollectorConfig
+	diagnosticsModule *metric.DiagnosticsModule
 }
 
 func NewDefaultCollectorDataSource(
@@ -67,6 +67,7 @@ func NewCollectorDataSource(
 	if store != nil {
 		wal, err := metric.NewWalinator(
 			config.ClusterID,
+			config.ApplicationName,
 			store,
 			resolutions,
 			updater,
@@ -79,8 +80,7 @@ func NewCollectorDataSource(
 		}
 	}
 
-	diagnosticsModule := metric.NewDiagnosticsModule(updater)
-	updater = diagnosticsModule
+	diagnosticsModule := metric.NewDiagnosticsModule()
 	scrapeController := scrape.NewScrapeController(
 		config.ScrapeInterval,
 		config.NetworkPort,
@@ -98,11 +98,11 @@ func NewCollectorDataSource(
 	clusterMap := newCollectorClusterMap(clusterInfo)
 
 	return &collectorDataSource{
-		config:         config,
-		metricsQuerier: metricQuerier,
-		clusterInfo:    clusterInfo,
-		clusterMap:     clusterMap,
-		diagnosticsModule:    diagnosticsModule,
+		config:            config,
+		metricsQuerier:    metricQuerier,
+		clusterInfo:       clusterInfo,
+		clusterMap:        clusterMap,
+		diagnosticsModule: diagnosticsModule,
 	}
 }
 
@@ -112,9 +112,11 @@ func (c *collectorDataSource) RegisterEndPoints(router *httprouter.Router) {
 
 func (c *collectorDataSource) RegisterDiagnostics(diagService diagnostics.DiagnosticService) {
 	const CollectorDiagnosticCategory = "collector"
+
 	diagnosticDefinitions := c.diagnosticsModule.DiagnosticsDefinitions()
+
 	for _, dd := range diagnosticDefinitions {
-		err := diagService.Register(dd.ID, dd.Description, CollectorDiagnosticCategory, func(ctx context.Context) (map[string]any, error) {
+		err := diagService.Register(dd.MetricName, dd.Description, CollectorDiagnosticCategory, func(ctx context.Context) (map[string]any, error) {
 			details, err := c.diagnosticsModule.DiagnosticsDetails(dd.ID)
 			if err != nil {
 				return nil, err

+ 9 - 11
modules/collector-source/pkg/env/collectorenv.go

@@ -5,27 +5,25 @@ import (
 )
 
 const (
-	NetworkPortEnvVar               = "NETWORK_PORT"
-	Collector10mResolutionRetention = "COLLECTOR_10M_RESOLUTION_RETENTION"
-	Collector1hResolutionRetention  = "COLLECTOR_1H_RESOLUTION_RETENTION"
-	Collection1dResolutionRetention = "COLLECTOR_1D_RESOLUTION_RETENTION"
-	CollectorScrapeInterval         = "COLLECTOR_SCRAPE_INTERVAL"
+	CollectorEnvVarPrefix   = "COLLECTOR_"
+	CollectorScrapeInterval = "COLLECTOR_SCRAPE_INTERVAL"
+	NetworkPortEnvVar       = "NETWORK_PORT"
 )
 
 func GetNetworkPort() int {
 	return env.GetInt(NetworkPortEnvVar, 3001)
 }
 
-func GetCollector10mResolutionRetention() int {
-	return env.GetInt(Collector10mResolutionRetention, 36)
+func GetCollectorResolution10mRetention() int {
+	return env.GetPrefixInt(CollectorEnvVarPrefix, env.Resolution10mRetentionEnvVar, 36)
 }
 
-func GetCollector1hResolutionRetention() int {
-	return env.GetInt(Collector1hResolutionRetention, 49)
+func GetCollectorResolution1hRetention() int {
+	return env.GetPrefixInt(CollectorEnvVarPrefix, env.Resolution1hRetentionEnvVar, 49)
 }
 
-func GetCollection1dResolutionRetention() int {
-	return env.GetInt(Collection1dResolutionRetention, 15)
+func GetCollectionResolution1dRetention() int {
+	return env.GetPrefixInt(CollectorEnvVarPrefix, env.Resolution1dRetentionEnvVar, 15)
 }
 
 func GetCollectorScrapeIntervalSeconds() string {

+ 38 - 0
modules/collector-source/pkg/event/scrape.go

@@ -0,0 +1,38 @@
+package event
+
+const (
+	DCGMScraperName              = "dcgm-metrics"
+	OpenCostScraperName          = "opencost-metrics"
+	NodeStatsScraperName         = "nodestats-metrics"
+	NetworkCostsScraperName      = "network-costs-metrics"
+	KubernetesClusterScraperName = "kubernetes-metrics"
+)
+
+const (
+	NodeScraperType        = "nodes"
+	NamespaceScraperType   = "namespaces"
+	ReplicaSetScraperType  = "replicasets"
+	DeploymentScraperType  = "deployments"
+	StatefulSetScraperType = "statefulsets"
+	ServiceScraperType     = "services"
+	PodScraperType         = "pods"
+	PvScraperType          = "pvs"
+	PvcScraperType         = "pvcs"
+)
+
+// ScrapeEvent is dispatched when a scrape is performed over a set of targets. It contains the name
+// of the scraper performing the scrape, the total number of targets, and any errors encountered.
+type ScrapeEvent struct {
+	// The name of the actual Scraper implementation performing the target scrapes.
+	ScraperName string
+
+	// The type of scrape being performed. For example, if a scraper performs multiple scrapes
+	// for different resources, this field can be used to distinguish between them.
+	ScrapeType string
+
+	// The total number of targets being accessed by the scraper.
+	Targets int
+
+	// Any errors that occurred during the scrape.
+	Errors []error
+}

+ 295 - 92
modules/collector-source/pkg/metric/diagnostics.go

@@ -2,39 +2,57 @@ package metric
 
 import (
 	"fmt"
-	"maps"
 	"sync"
+
+	"github.com/kubecost/events"
+	"github.com/opencost/opencost/core/pkg/collections"
+	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/core/pkg/util/sliceutil"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 )
 
 // Collector Metric Diagnostic IDs
 const (
-	// KubecostDiagnosticMetricID is the identifier for the metric used to determine if Kubecost metrics are being scraped.
-	KubecostDiagnosticMetricID = "kubecostMetric"
+	// DcgmScraperDiagnosticID contains the identifier for the the DCGM scraper diagnostic.
+	DcgmScraperDiagnosticID = event.DCGMScraperName
 
-	// NodesDiagnosticMetricID is the identifier for the query used to determine if the node CPU cores capacity is being scraped
-	NodesDiagnosticMetricID = "nodesCPUMetrics"
-)
+	// OpenCostScraperDiagnosticID contains the identifier for the the opencost metrics scraper diagnostic
+	OpenCostScraperDiagnosticID = event.OpenCostScraperName
 
-// diagnostic definitions mapping holds all of the diagnostic definitions that can be used for collector metrics diagnostics
-var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnosticDefinition{
-	NodesDiagnosticMetricID: {
-		ID:          NodesDiagnosticMetricID,
-		MetricName:  KubeNodeStatusCapacityCPUCores,
-		Label:       "Node CPU cores capacity is being scraped",
-		Description: "Determine if the node CPU cores capacity is being scraped",
-	},
+	// NodeStatsScraperDiagnosticID contains the identifier for the the node stats summary scraper diagnostic
+	NodeStatsScraperDiagnosticID = event.NodeStatsScraperName
 
-	KubecostDiagnosticMetricID: {
-		ID:          KubecostDiagnosticMetricID,
-		MetricName:  NodeTotalHourlyCost,
-		Label:       "Kubecost metrics for a node are being scraped",
-		Description: "Determine if kubecost metrics for a node are being scraped",
-	},
-}
+	// NetworkCostsScraperDiagnosticID contains the identifier for the the network-costs scraper diagnostic.
+	NetworkCostsScraperDiagnosticID = event.NetworkCostsScraperName
 
-// diagnosticsResults stores the current state of diagnostic results
-var diagnosticsResults map[string]*diagnosticsResult = make(map[string]*diagnosticsResult)
+	// Kubernetes scrapers contains the identifiers for all the specific KubernetesCluster scrapers.
+	KubernetesNodesScraperDiagnosticID        = event.KubernetesClusterScraperName + "-" + event.NodeScraperType
+	KubernetesNamespacesScraperDiagnosticID   = event.KubernetesClusterScraperName + "-" + event.NamespaceScraperType
+	KubernetesReplicaSetsScraperDiagnosticID  = event.KubernetesClusterScraperName + "-" + event.ReplicaSetScraperType
+	KubernetesDeploymentsScraperDiagnosticID  = event.KubernetesClusterScraperName + "-" + event.DeploymentScraperType
+	KubernetesStatefulSetsScraperDiagnosticID = event.KubernetesClusterScraperName + "-" + event.StatefulSetScraperType
+	KubernetesServicesScraperDiagnosticID     = event.KubernetesClusterScraperName + "-" + event.ServiceScraperType
+	KubernetesPodsScraperDiagnosticID         = event.KubernetesClusterScraperName + "-" + event.PodScraperType
+	KubernetesPvsScraperDiagnosticID          = event.KubernetesClusterScraperName + "-" + event.PvScraperType
+	KubernetesPvcsScraperDiagnosticID         = event.KubernetesClusterScraperName + "-" + event.PvcScraperType
 
+	// Metric Names for the diagnostics (used in the UI)
+	DGGMScraperDiagnosticMetricName                   = "DCGM Metrics"
+	OpenCostScraperDiagnosticMetricName               = "Opencost Metrics"
+	NodeStatsScraperDiagnosticMetricName              = "Node Stats Metrics"
+	NetworkCostsScraperDiagnosticMetricName           = "Network Costs Metrics"
+	KubernetesNodesScraperDiagnosticMetricName        = "Kubernetes Nodes Metrics"
+	KubernetesNamespacesScraperDiagnosticMetricName   = "Kubernetes Namespaces Metrics"
+	KubernetesReplicaSetsScraperDiagnosticMetricName  = "Kubernetes Replica Sets Metrics"
+	KubernetesDeploymentsScraperDiagnosticMetricName  = "Kubernetes Deployments Metrics"
+	KubernetesStatefulSetsScraperDiagnosticMetricName = "Kubernetes Stateful Sets Metrics"
+	KubernetesServicesScraperDiagnosticMetricName     = "Kubernetes Services Metrics"
+	KubernetesPodsScraperDiagnosticMetricName         = "Kubernetes Pods Metrics"
+	KubernetesPvsScraperDiagnosticMetricName          = "Kubernetes PVs Metrics"
+	KubernetesPvcsScraperDiagnosticMetricName         = "Kubernetes PVCs Metrics"
+)
+
+// diagnostic defintion is the type used to define a deterministic list of specific diagnostics we _expect_ to collect
 type diagnosticDefinition struct {
 	ID          string
 	MetricName  string
@@ -43,100 +61,285 @@ type diagnosticDefinition struct {
 	DocLink     string
 }
 
-type diagnosticsResult struct {
-	Result map[string]any
-	Passed bool
+// diagnosticDefinitions holds all of the diagnostic definitions that can be used for collector metrics diagnostics
+var diagnosticDefinitions map[string]*diagnosticDefinition = map[string]*diagnosticDefinition{
+	DcgmScraperDiagnosticID: {
+		ID:          DcgmScraperDiagnosticID,
+		MetricName:  DGGMScraperDiagnosticMetricName,
+		Label:       "DCGM scraper is available and is being scraped.",
+		Description: scraperDiagnosticDescriptionFor(event.DCGMScraperName, ""),
+	},
+
+	OpenCostScraperDiagnosticID: {
+		ID:          OpenCostScraperDiagnosticID,
+		MetricName:  OpenCostScraperDiagnosticMetricName,
+		Label:       "Opencost metrics scraper is available and is being scraped.",
+		Description: scraperDiagnosticDescriptionFor(event.OpenCostScraperName, ""),
+	},
+
+	NodeStatsScraperDiagnosticID: {
+		ID:          NodeStatsScraperDiagnosticID,
+		MetricName:  NodeStatsScraperDiagnosticMetricName,
+		Label:       "Node stats summary scraper is available and is being scraped.",
+		Description: scraperDiagnosticDescriptionFor(event.NodeStatsScraperName, ""),
+	},
+
+	NetworkCostsScraperDiagnosticID: {
+		ID:          NetworkCostsScraperDiagnosticID,
+		MetricName:  NetworkCostsScraperDiagnosticMetricName,
+		Label:       "Network costs daemonset metrics scrapers are available and being scraped.",
+		Description: scraperDiagnosticDescriptionFor(event.NetworkCostsScraperName, ""),
+	},
+
+	KubernetesNodesScraperDiagnosticID: {
+		ID:          KubernetesNodesScraperDiagnosticID,
+		MetricName:  KubernetesNodesScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.NodeScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.NodeScraperType),
+	},
+
+	KubernetesNamespacesScraperDiagnosticID: {
+		ID:          KubernetesNamespacesScraperDiagnosticID,
+		MetricName:  KubernetesNamespacesScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.NamespaceScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.NamespaceScraperType),
+	},
+
+	KubernetesReplicaSetsScraperDiagnosticID: {
+		ID:          KubernetesReplicaSetsScraperDiagnosticID,
+		MetricName:  KubernetesReplicaSetsScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.ReplicaSetScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.ReplicaSetScraperType),
+	},
+
+	KubernetesDeploymentsScraperDiagnosticID: {
+		ID:          KubernetesDeploymentsScraperDiagnosticID,
+		MetricName:  KubernetesDeploymentsScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.DeploymentScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.DeploymentScraperType),
+	},
+
+	KubernetesStatefulSetsScraperDiagnosticID: {
+		ID:          KubernetesStatefulSetsScraperDiagnosticID,
+		MetricName:  KubernetesStatefulSetsScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.StatefulSetScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.StatefulSetScraperType),
+	},
+
+	KubernetesServicesScraperDiagnosticID: {
+		ID:          KubernetesServicesScraperDiagnosticID,
+		MetricName:  KubernetesServicesScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.ServiceScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.ServiceScraperType),
+	},
+
+	KubernetesPodsScraperDiagnosticID: {
+		ID:          KubernetesPodsScraperDiagnosticID,
+		MetricName:  KubernetesPodsScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.PodScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.PodScraperType),
+	},
+
+	KubernetesPvsScraperDiagnosticID: {
+		ID:          KubernetesPvsScraperDiagnosticID,
+		MetricName:  KubernetesPvsScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.PvScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.PvScraperType),
+	},
+
+	KubernetesPvcsScraperDiagnosticID: {
+		ID:          KubernetesPvcsScraperDiagnosticID,
+		MetricName:  KubernetesPvcsScraperDiagnosticMetricName,
+		Label:       fmt.Sprintf("Kubernetes cluster resources: %s are available and being scraped", event.PvcScraperType),
+		Description: scraperDiagnosticDescriptionFor(event.KubernetesClusterScraperName, event.PvcScraperType),
+	},
 }
 
-type DiagnosticsModule struct {
-	lock    sync.RWMutex
-	updater Updater
+// scraper identifier for diagnostic mapping _must_ match diagnostic ids defined above
+func scraperIdFor(scraperName, scrapeType string) string {
+	if scrapeType == "" {
+		return scraperName
+	}
+	return fmt.Sprintf("%s-%s", scraperName, scrapeType)
 }
 
-func NewDiagnosticsModule(updater Updater) *DiagnosticsModule {
-	// Initialize diagnostics results to false to represent that no data has been collected yet
-	for id := range diagnosticDefinitions {
-		diagnosticsResults[id] = &diagnosticsResult{
-			Result: make(map[string]any),
-			Passed: false,
-		}
+// helper for generating dynamic scraper events diagnostic descriptions
+func scraperDiagnosticDescriptionFor(scraperName, scrapeType string) string {
+	if scrapeType == "" {
+		return fmt.Sprintf("Determine if the scraper for: %s is correctly reporting data", scraperName)
+	}
+	return fmt.Sprintf("Determine if the scraper for: %s is correctly reporting data for type: %s", scraperName, scrapeType)
+}
+
+// CollectorDiagnostic is a basic interface used to allow various types of diagnostic data collection
+type CollectorDiagnostic interface {
+	// Id returns the identifier for the diagnostic
+	Id() string
+
+	// Name returns the name of the metric being run
+	Name() string
+
+	// Details generates an exportable detail map for the specific diagnostic, and resets any of its internal
+	// state for the current cycle.
+	Details() map[string]any
+}
+
+// scrapeDiagnostic maintains the latest state of each scrape event that occurs. scrape
+// events can be registered for any event, but only the specific scrapes with diagnostic
+// definitions defined will export as diagnostics.
+type scrapeDiagnostic struct {
+	diagnostic *diagnosticDefinition
+	scraper    string
+	scrapeType string
+	targets    int
+	errors     []error
+}
+
+// creates a new scrape diagnostic from the event data and diagnostics definition
+func newScrapeDiagnostic(
+	scrapeEvent event.ScrapeEvent,
+	definition *diagnosticDefinition,
+) *scrapeDiagnostic {
+	return &scrapeDiagnostic{
+		diagnostic: definition,
+		scraper:    scrapeEvent.ScraperName,
+		scrapeType: scrapeEvent.ScrapeType,
+		targets:    scrapeEvent.Targets,
+		errors:     scrapeEvent.Errors,
+	}
+}
+
+// Id returns the diagnostic definition's ID when one is present; otherwise a concatenation of scraper and scrapeType (if a scrapeType exists).
+func (sd *scrapeDiagnostic) Id() string {
+	if sd.diagnostic != nil {
+		return sd.diagnostic.ID
+	}
+	return scraperIdFor(sd.scraper, sd.scrapeType)
+}
+
+// Name returns the diagnostic definition's metric name when one is present; otherwise the scraper/scrapeType identifier.
+func (sd *scrapeDiagnostic) Name() string {
+	if sd.diagnostic != nil {
+		return sd.diagnostic.MetricName
+	}
+	return scraperIdFor(sd.scraper, sd.scrapeType)
+}
+
+// Details generates an exportable detail map for the specific diagnostic, and resets any of its internal
+// state for the current cycle.
+func (sd *scrapeDiagnostic) Details() map[string]any {
+	// passed if there are no errors
+	passed := len(sd.errors) == 0
+
+	// map errors to a string slice for easier propagation
+	var errs []string
+	if !passed {
+		errs = sliceutil.Map(sd.errors, func(e error) string { return e.Error() })
+	} else {
+		errs = []string{}
 	}
 
-	return &DiagnosticsModule{
-		updater: updater,
+	// since a scrape event does not require a matching diagnostic definition,
+	// we must generate properties normally extracted from the definition
+	var label string
+	if sd.diagnostic != nil {
+		label = sd.diagnostic.Label
+	} else {
+		label = fmt.Sprintf("%s scraper is available and being scraped.", sd.scraper)
 	}
+
+	// same for doclink
+	var docLink string
+	if sd.diagnostic != nil {
+		docLink = sd.diagnostic.DocLink
+	} else {
+		docLink = ""
+	}
+
+	details := map[string]any{
+		// stats contains total entities to scrape, success (of the total), and failures (of the total)
+		"stats": map[string]any{
+			"total":   sd.targets,
+			"success": max(sd.targets-len(errs), 0),
+			"fail":    len(errs),
+		},
+		"label":   label,
+		"docLink": docLink,
+		"errors":  errs,
+		"passed":  passed,
+	}
+
+	// scraper diagnostics do not maintain any internal/historical state
+	// to reset -- it just maintains the most recent data. if we decide
+	// to track historical event data, would need to reset the state after
+	// this call.
+
+	return details
+}
+
+// DiagnosticsModule is a helper type for managing all of the internal diagnostics for the collector datasource.
+type DiagnosticsModule struct {
+	lock            sync.RWMutex
+	diagnostics     *collections.IdNameMap[CollectorDiagnostic]
+	scrapeHandlerId events.HandlerID // scrape event handler identifier for removal
 }
 
-func (d *DiagnosticsModule) Update(updateSet *UpdateSet) {
-	if updateSet == nil {
+// NewDiagnosticsModule creates a new `DiagnosticsModule` instance to be used with a collector data source
+func NewDiagnosticsModule() *DiagnosticsModule {
+	diagnostics := collections.NewIdNameMap[CollectorDiagnostic]()
+	dm := &DiagnosticsModule{
+		diagnostics: diagnostics,
+	}
+
+	scrapeEvents := events.GlobalDispatcherFor[event.ScrapeEvent]()
+	dm.scrapeHandlerId = scrapeEvents.AddEventHandler(dm.onScrapeEvent)
+
+	return dm
+}
+
+// handles a scrape event dispatched -- updates the record for the specific scrape
+// diagnostic.
+func (d *DiagnosticsModule) onScrapeEvent(event event.ScrapeEvent) {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+
+	id := scraperIdFor(event.ScraperName, event.ScrapeType)
+
+	// scrape events can occur without a backing diagnostic definition -- just
+	// ignore if this happens
+	def, ok := diagnosticDefinitions[id]
+	if !ok {
 		return
 	}
 
-	// Create a deep copy for the async update to avoid race condition
-	updateSetCopy := &UpdateSet{
-		Timestamp: updateSet.Timestamp,
-		Updates:   make([]Update, len(updateSet.Updates)),
+	err := d.diagnostics.Insert(newScrapeDiagnostic(event, def))
+	if err != nil {
+		log.Errorf("failed to insert scrape diagnostic: %s", err)
 	}
-	copy(updateSetCopy.Updates, updateSet.Updates)
-
-	// This is done so that the update func is marked complete when both the updater and diagnostics are done
-	// Otherwise we might face a race condition when calling the diagnostics details func before the diagnostics are done
-	var wg sync.WaitGroup
-	wg.Add(2) // 1 for updater, 1 for diagnostics
-
-	go func() {
-		defer wg.Done()
-		d.lock.Lock()
-		defer d.lock.Unlock()
-
-		timestamp := updateSet.Timestamp.String()
-		for id, dd := range diagnosticDefinitions {
-			for _, update := range updateSet.Updates {
-				if update.Name == dd.MetricName {
-					if len(diagnosticsResults[id].Result) == 0 {
-						// For the first UpdateSet received for that metric, we default to true. If we later miss the metric for a timestamp, it will be set to false.
-						diagnosticsResults[id].Passed = true
-					}
-					diagnosticsResults[id].Result[timestamp] = update.Value
-				}
-			}
-			if diagnosticsResults[id].Result[timestamp] == nil {
-				diagnosticsResults[id].Passed = false
-			}
-		}
-	}()
-
-	// We are still maintaining the order in which the updates to the repo are called
-	// as this function gets the new call only when both these go routines are done
-	go func() {
-		defer wg.Done()
-		d.updater.Update(updateSetCopy)
-	}()
-
-	wg.Wait()
 }
 
+// DiagnosticsDefinitions returns a deterministic mapping of pre-defined diagnostics used with the collector.
 func (d *DiagnosticsModule) DiagnosticsDefinitions() map[string]*diagnosticDefinition {
 	return diagnosticDefinitions
 }
 
+// DiagnosticsDetails returns the latest details for the diagnostic with the given ID.
 func (d *DiagnosticsModule) DiagnosticsDetails(diagnosticsId string) (map[string]any, error) {
 	d.lock.RLock()
 	defer d.lock.RUnlock()
 
+	// If a bogus diagnostics id was passed, we can check the definitions first
 	if _, exists := diagnosticDefinitions[diagnosticsId]; !exists {
-		return nil, fmt.Errorf("diagnostic ID: %s not found", diagnosticsId)
+		return nil, fmt.Errorf("invalid diagnostic id: %s not found", diagnosticsId)
 	}
 
-	details := map[string]any{
-		"query":   diagnosticDefinitions[diagnosticsId].MetricName,
-		"label":   diagnosticDefinitions[diagnosticsId].Label,
-		"docLink": diagnosticDefinitions[diagnosticsId].DocLink,
-		"result":  maps.Clone(diagnosticsResults[diagnosticsId].Result),
-		"passed":  diagnosticsResults[diagnosticsId].Passed,
+	// for some diagnostics, like the scraper variant, they may not have been registered
+	// yet (no scrape events), so we should return an error indicating that the scrape
+	// hasn't occurred yet
+	diagnostic, exists := d.diagnostics.ById(diagnosticsId)
+	if !exists {
+		return nil, fmt.Errorf("diagnostic not available: %s", diagnosticsId)
 	}
-	// reset the result and passed for the next run
-	diagnosticsResults[diagnosticsId].Result = make(map[string]any)
-	diagnosticsResults[diagnosticsId].Passed = false
-	return details, nil
+
+	return diagnostic.Details(), nil
 }

+ 106 - 154
modules/collector-source/pkg/metric/diagnostics_test.go

@@ -1,194 +1,146 @@
 package metric
 
 import (
+	"fmt"
 	"testing"
 	"time"
-)
-
-// MockUpdater implements the Updater interface for testing
-type MockUpdater struct {
-}
 
-func (m *MockUpdater) Update(updateSet *UpdateSet) {
-}
+	"github.com/kubecost/events"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
+)
 
-// Test Update func in DiagnosticsModule and check if diagnostics pass
-func TestDiagnosticsModule_Update(t *testing.T) {
-	mockUpdater := &MockUpdater{}
-	module := NewDiagnosticsModule(mockUpdater)
-
-	// Test with valid update set containing node metrics
-	timestamp := time.Now()
-	updateSet := &UpdateSet{
-		Timestamp: timestamp,
-		Updates: []Update{
-			{
-				Name:  KubeNodeStatusCapacityCPUCores,
-				Value: 4.0,
-			},
-			{
-				Name:  NodeTotalHourlyCost,
-				Value: 0.50,
-			},
+func TestDiagnosticsModule_ScrapeDiagnostics(t *testing.T) {
+	module := NewDiagnosticsModule()
+
+	// dispatch some faux scrape events
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.NetworkCostsScraperName,
+		Targets:     10,
+		Errors:      []error{},
+	})
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.NodeScraperType,
+		Targets:     8,
+		Errors: []error{
+			fmt.Errorf("failed to scrape node 'foo'"),
+			fmt.Errorf("failed to scrape node 'bar'"),
 		},
-	}
+	})
 
-	module.Update(updateSet)
+	time.Sleep(500 * time.Millisecond)
 
-	// Check both diagnostics
-	nodeDetails, err := module.DiagnosticsDetails(NodesDiagnosticMetricID)
+	networkDiagnosticDetails, err := module.DiagnosticsDetails(NetworkCostsScraperDiagnosticID)
 	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
-	}
-	if nodeDetails["passed"] != true {
-		t.Error("Expected node diagnostic to pass")
+		t.Fatalf("unexpected error: %s", err)
+		return
 	}
 
-	kubecostDetails, err := module.DiagnosticsDetails(KubecostDiagnosticMetricID)
-	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
+	stats := networkDiagnosticDetails["stats"].(map[string]any)
+	errors := networkDiagnosticDetails["errors"].([]string)
+	label := networkDiagnosticDetails["label"].(string)
+
+	statsTotal := stats["total"].(int)
+	statsSuccess := stats["success"].(int)
+	statsFail := stats["fail"].(int)
+
+	if statsTotal != 10 {
+		t.Fatalf("expected networkCostsDetails[\"stats\"][\"total\"] to equal 10, got: %d", statsTotal)
+		return
 	}
-	if kubecostDetails["passed"] != true {
-		t.Error("Expected kubecost diagnostic to pass")
+	if statsSuccess != 10 {
+		t.Fatalf("expected networkCostsDetails[\"stats\"][\"success\"] to equal 10, got: %d", statsSuccess)
+		return
 	}
-}
-
-// Test Update func in DiagnosticsModule with missing metrics and test if diagnostics fail
-func TestDiagnosticsModule_UpdateWithMissingMetrics(t *testing.T) {
-	mockUpdater := &MockUpdater{}
-	module := NewDiagnosticsModule(mockUpdater)
-
-	timestamp := time.Now()
-	updateSet := &UpdateSet{
-		Timestamp: timestamp,
-		Updates: []Update{
-			{
-				Name:  "some_other_metric",
-				Value: 1.0,
-			},
-		},
+	if statsFail != 0 {
+		t.Fatalf("expected networkCostsDetails[\"stats\"][\"fail\"] to equal 0, got: %d", statsFail)
+		return
 	}
 
-	module.Update(updateSet)
-
-	// Check that diagnostics fail when their metrics are missing
-	nodeDetails, err := module.DiagnosticsDetails(NodesDiagnosticMetricID)
-	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
+	if len(errors) != 0 {
+		t.Fatalf("expected len(networkCostsDetails[\"errors\"]) to equal 0, got: %d", len(errors))
+		return
 	}
-	if nodeDetails["passed"] != false {
-		t.Error("Expected node diagnostic to fail when metric is missing")
+
+	if len(label) == 0 {
+		t.Fatalf("expected len(networkCostsDetails[\"label\"]) to be non-zero. Got 0.")
+		return
 	}
 
-	kubecostDetails, err := module.DiagnosticsDetails(KubecostDiagnosticMetricID)
+	nodeScrapeDetails, err := module.DiagnosticsDetails(KubernetesNodesScraperDiagnosticID)
 	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
+		t.Fatalf("unexpected error: %s", err)
+		return
 	}
-	if kubecostDetails["passed"] != false {
-		t.Error("Expected kubecost diagnostic to fail when metric is missing")
-	}
-}
 
-// Test DiagnosticsDetails func in DiagnosticsModule with invalid and valid diagnostic IDs
-func TestDiagnosticsModule_DiagnosticsDetails(t *testing.T) {
-	mockUpdater := &MockUpdater{}
-	module := NewDiagnosticsModule(mockUpdater)
+	stats = nodeScrapeDetails["stats"].(map[string]any)
+	errors = nodeScrapeDetails["errors"].([]string)
+	label = nodeScrapeDetails["label"].(string)
 
-	// Test with invalid diagnostic ID
-	_, err := module.DiagnosticsDetails("invalid_id")
-	if err.Error() != "diagnostic ID: invalid_id not found" {
-		t.Error("Expected error for invalid diagnostic ID")
-	}
+	statsTotal = stats["total"].(int)
+	statsSuccess = stats["success"].(int)
+	statsFail = stats["fail"].(int)
 
-	// Test with valid diagnostic ID
-	details, err := module.DiagnosticsDetails(NodesDiagnosticMetricID)
-	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
+	if statsTotal != 8 {
+		t.Fatalf("expected nodeScrapeDetails[\"stats\"][\"total\"] to equal 8, got: %d", statsTotal)
+		return
+	}
+	if statsSuccess != 6 {
+		t.Fatalf("expected nodeScrapeDetails[\"stats\"][\"success\"] to equal 6, got: %d", statsSuccess)
+		return
 	}
-	if details["error"] != nil {
-		t.Error("Expected no error for valid diagnostic ID")
+	if statsFail != 2 {
+		t.Fatalf("expected nodeScrapeDetails[\"stats\"][\"fail\"] to equal 2, got: %d", statsFail)
+		return
 	}
 
-	// Check required fields
-	requiredFields := []string{"query", "label", "result", "passed", "docLink"}
-	for _, field := range requiredFields {
-		if details[field] == nil {
-			t.Errorf("Expected field %s to be present", field)
-		}
+	if len(errors) != 2 {
+		t.Fatalf("expected len(nodeScrapeDetails[\"errors\"]) to equal 2, got: %d", len(errors))
+		return
 	}
-}
 
-// Test concurrent access(race condition) to DiagnosticsModule
-func TestDiagnosticsModule_ConcurrentAccess(t *testing.T) {
-	mockUpdater := &MockUpdater{}
-	module := NewDiagnosticsModule(mockUpdater)
-
-	// Test concurrent access to diagnostics
-	done := make(chan bool, 2)
-
-	go func() {
-		for i := 0; i < 100; i++ {
-			module.DiagnosticsDefinitions()
-		}
-		done <- true
-	}()
-
-	go func() {
-		for i := 0; i < 100; i++ {
-			timestamp := time.Now()
-			updateSet := &UpdateSet{
-				Timestamp: timestamp,
-				Updates: []Update{
-					{
-						Name:  KubeNodeStatusCapacityCPUCores,
-						Value: float64(i),
-					},
-				},
-			}
-			module.Update(updateSet)
-		}
-		done <- true
-	}()
-
-	<-done
-	<-done
-	// If we get here without a race condition, the test passes
+	if len(label) == 0 {
+		t.Fatalf("expected len(nodeScrapeDetails[\"label\"]) to be non-zero. Got 0.")
+		return
+	}
 }
 
-// Test reset of diagnostics after details are retrieved
-func TestDiagnosticsModule_ResetAfterDetails(t *testing.T) {
-	mockUpdater := &MockUpdater{}
-	module := NewDiagnosticsModule(mockUpdater)
-
-	// Add some data
-	timestamp := time.Now()
-	updateSet := &UpdateSet{
-		Timestamp: timestamp,
-		Updates: []Update{
-			{
-				Name:  KubeNodeStatusCapacityCPUCores,
-				Value: 4.0,
-			},
+func TestDiagnosticsModule_ScrapeDiagnosticsWithSameScraperName(t *testing.T) {
+	module := NewDiagnosticsModule()
+
+	// dispatch some faux scrape events with same scraper name
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.NodeScraperType,
+		Targets:     8,
+		Errors: []error{
+			fmt.Errorf("failed to scrape node 'foo'"),
+			fmt.Errorf("failed to scrape node 'bar'"),
 		},
-	}
+	})
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.PodScraperType,
+		Targets:     8,
+		Errors: []error{
+			fmt.Errorf("failed to scrape node 'foo'"),
+			fmt.Errorf("failed to scrape node 'bar'"),
+		},
+	})
 
-	module.Update(updateSet)
+	time.Sleep(500 * time.Millisecond)
 
-	// Get details (this should reset the diagnostic)
-	details, err := module.DiagnosticsDetails(NodesDiagnosticMetricID)
+	// for both diagnostics: if they remained unregistered even after an event was dispatched, getting the details would raise an error
+	_, err := module.DiagnosticsDetails(KubernetesNodesScraperDiagnosticID)
 	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
+		t.Fatalf("unexpected error: %s", err)
+		return
 	}
-	if details["passed"] != true {
-		t.Error("Expected diagnostic to pass before reset")
-	}
-
-	// Get details again (should be reset)
-	details2, err := module.DiagnosticsDetails(NodesDiagnosticMetricID)
+	_, err = module.DiagnosticsDetails(KubernetesPodsScraperDiagnosticID)
 	if err != nil {
-		t.Error("Expected no error for valid diagnostic ID")
-	}
-	if details2["passed"] != false {
-		t.Error("Expected diagnostic to be reset after first details call")
+		t.Fatalf("unexpected error: %s", err)
+		return
 	}
 }

+ 2 - 1
modules/collector-source/pkg/metric/walinator.go

@@ -37,6 +37,7 @@ type Walinator struct {
 
 func NewWalinator(
 	clusterID string,
+	applicationName string,
 	store storage.Storage,
 	resolutions []*util.Resolution,
 	updater Updater,
@@ -47,7 +48,7 @@ func NewWalinator(
 			limitResolution = resolution
 		}
 	}
-	pathFormatter, err := pathing.NewEventStoragePathFormatter("", clusterID, ControllerEventName)
+	pathFormatter, err := pathing.NewEventStoragePathFormatter(applicationName, clusterID, ControllerEventName)
 	if err != nil {
 		return nil, fmt.Errorf("filed to create path formatter for scrape controller: %s", err.Error())
 	}

+ 3 - 0
modules/collector-source/pkg/metric/walinator_test.go

@@ -58,6 +58,7 @@ func TestWalinator_Update(t *testing.T) {
 		testMetricCollector,
 	)
 	wal, _ := NewWalinator(
+		"test",
 		"test",
 		store,
 		resolutions,
@@ -107,6 +108,7 @@ func TestWalinator_restore(t *testing.T) {
 		testMetricCollector,
 	)
 	wal, _ := NewWalinator(
+		"test",
 		"test",
 		store,
 		resolutions,
@@ -219,6 +221,7 @@ func TestWalinator_clean(t *testing.T) {
 		testMetricCollector,
 	)
 	wal, _ := NewWalinator(
+		"test",
 		"test",
 		store,
 		resolutions,

+ 74 - 0
modules/collector-source/pkg/scrape/clustercache.go

@@ -5,10 +5,12 @@ import (
 	"slices"
 	"strings"
 
+	"github.com/kubecost/events"
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/core/pkg/log"
 	"github.com/opencost/opencost/core/pkg/source"
 	"github.com/opencost/opencost/core/pkg/util/promutil"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
 	"github.com/opencost/opencost/modules/collector-source/pkg/util"
 	"golang.org/x/exp/maps"
@@ -110,6 +112,14 @@ func (ccs *ClusterCacheScraper) scrapeNodes(nodes []*clustercache.Node) []metric
 		})
 
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.NodeScraperType,
+		Targets:     len(nodes),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -137,6 +147,14 @@ func (ccs *ClusterCacheScraper) scrapeDeployments(deployments []*clustercache.De
 			AdditionalInfo: deploymentLabels,
 		})
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.DeploymentScraperType,
+		Targets:     len(deployments),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -172,6 +190,14 @@ func (ccs *ClusterCacheScraper) scrapeNamespaces(namespaces []*clustercache.Name
 			AdditionalInfo: namespaceAnnotations,
 		})
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.NamespaceScraperType,
+		Targets:     len(namespaces),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -266,6 +292,14 @@ func (ccs *ClusterCacheScraper) scrapePods(pods []*clustercache.Pod) []metric.Up
 			}
 		}
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.PodScraperType,
+		Targets:     len(pods),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -298,6 +332,14 @@ func (ccs *ClusterCacheScraper) scrapePVCs(pvcs []*clustercache.PersistentVolume
 			})
 		}
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.PvcScraperType,
+		Targets:     len(pvcs),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -334,6 +376,14 @@ func (ccs *ClusterCacheScraper) scrapePVs(pvs []*clustercache.PersistentVolume)
 			})
 		}
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.PvScraperType,
+		Targets:     len(pvs),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -361,6 +411,14 @@ func (ccs *ClusterCacheScraper) scrapeServices(services []*clustercache.Service)
 		})
 
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.ServiceScraperType,
+		Targets:     len(services),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -387,6 +445,14 @@ func (ccs *ClusterCacheScraper) scrapeStatefulSets(statefulSets []*clustercache.
 			AdditionalInfo: statefulSetLabels,
 		})
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.StatefulSetScraperType,
+		Targets:     len(statefulSets),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 
@@ -414,6 +480,14 @@ func (ccs *ClusterCacheScraper) scrapeReplicaSets(replicaSets []*clustercache.Re
 			})
 		}
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.KubernetesClusterScraperName,
+		ScrapeType:  event.ReplicaSetScraperType,
+		Targets:     len(replicaSets),
+		Errors:      nil,
+	})
+
 	return scrapeResults
 }
 

+ 18 - 8
modules/collector-source/pkg/scrape/dcgm.go

@@ -6,8 +6,10 @@ import (
 
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
 	"github.com/opencost/opencost/modules/collector-source/pkg/scrape/target"
+	v1 "k8s.io/api/core/v1"
 )
 
 var dcgmRegex = regexp.MustCompile("(?i)(.*dcgm-exporter.*)")
@@ -19,6 +21,7 @@ func newDCGMScrapper(clusterCache clustercache.ClusterCache) Scraper {
 
 func newDCGMTargetScraper(provider target.TargetProvider) *TargetScraper {
 	return newTargetScrapper(
+		event.DCGMScraperName,
 		provider,
 		[]string{
 			metric.DCGMFIPROFGRENGINEACTIVE,
@@ -29,25 +32,32 @@ func newDCGMTargetScraper(provider target.TargetProvider) *TargetScraper {
 
 type DCGMTargetProvider struct {
 	clusterCache clustercache.ClusterCache
+	port         int
 }
 
 func newDCGMTargetProvider(clusterCache clustercache.ClusterCache) *DCGMTargetProvider {
 	return &DCGMTargetProvider{
 		clusterCache: clusterCache,
+		port:         9400,
 	}
 }
 
 func (p *DCGMTargetProvider) GetTargets() []target.ScrapeTarget {
-	svcs := p.clusterCache.GetAllServices()
+	// NOTE: The proper way to discover these targets is to first identify a Service that
+	// NOTE: matches a specific selector. Then, locate the Endpoints kubernetes resource associated
+	// NOTE: with that Service. This Endpoints resource has a list of all the targeted pods and their
+	// NOTE: addresses. We do _not_ have the Endpoints resource on our cluster cache at the moment,
+	// NOTE: so we'll perform this lookup ourselves.
+	pods := p.clusterCache.GetAllPods()
+
 	var targets []target.ScrapeTarget
-	for _, svc := range svcs {
-		if svc.ClusterIP == "" || !isDCGM(svc.SpecSelector) {
-			continue
+	for _, pod := range pods {
+		if pod.Status.Phase == v1.PodRunning && isDCGM(pod.Labels) {
+			log.Debugf("DCGM: found target: http://%s:%d/metrics", pod.Status.PodIP, p.port)
+
+			t := target.NewUrlTarget(fmt.Sprintf("http://%s:%d/metrics", pod.Status.PodIP, p.port))
+			targets = append(targets, t)
 		}
-		port := 9400
-		log.Debugf("DCGM: found target: http://%s:%d/metrics", svc.ClusterIP, port)
-		t := target.NewUrlTarget(fmt.Sprintf("http://%s:%d/metrics", svc.ClusterIP, port))
-		targets = append(targets, t)
 	}
 
 	return targets

+ 20 - 3
modules/collector-source/pkg/scrape/network.go

@@ -5,8 +5,15 @@ import (
 
 	"github.com/opencost/opencost/core/pkg/clustercache"
 	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
 	"github.com/opencost/opencost/modules/collector-source/pkg/scrape/target"
+	v1 "k8s.io/api/core/v1"
+)
+
+const (
+	NetworkCostsNameLabel     = "network-costs"
+	NetworkCostsInstanceLabel = "kubecost"
 )
 
 func newNetworkScraper(
@@ -19,6 +26,7 @@ func newNetworkScraper(
 
 func newNetworkTargetScraper(provider target.TargetProvider) *TargetScraper {
 	return newTargetScrapper(
+		event.NetworkCostsScraperName,
 		provider,
 		[]string{
 			metric.KubecostPodNetworkEgressBytesTotal,
@@ -40,14 +48,18 @@ func NewNetworkTargetProvider(port int, clusterCache clustercache.ClusterCache)
 }
 
 func (n *NetworkTargetProvider) GetTargets() []target.ScrapeTarget {
+	// NOTE: The proper way to discover these targets is to first identify a Service that
+	// NOTE: matches a specific selector. Then, locate the Endpoints kubernetes resource associated
+	// NOTE: with that Service. This Endpoints resource has a list of all the targeted pods and their
+	// NOTE: addresses. We do _not_ have the Endpoints resource on our cluster cache at the moment,
+	// NOTE: so we'll perform this lookup ourselves.
 	pods := n.clusterCache.GetAllPods()
 
 	var targets []target.ScrapeTarget
 	for _, pod := range pods {
-		instance := pod.Labels["app.kubernetes.io/instance"]
-		name := pod.Labels["app.kubernetes.io/name"]
-		if name == "network-costs" && instance == "kubecost" && pod.Status.Phase == "Running" {
+		if pod.Status.Phase == v1.PodRunning && isNetworkCosts(pod.Labels) {
 			log.Debugf("Network: found target for http://%s:%d/metrics", pod.Status.PodIP, n.port)
+
 			t := target.NewUrlTarget(fmt.Sprintf("http://%s:%d/metrics", pod.Status.PodIP, n.port))
 			targets = append(targets, t)
 		}
@@ -55,3 +67,8 @@ func (n *NetworkTargetProvider) GetTargets() []target.ScrapeTarget {
 
 	return targets
 }
+
+func isNetworkCosts(labels map[string]string) bool {
+	return labels["app.kubernetes.io/name"] == NetworkCostsNameLabel &&
+		labels["app.kubernetes.io/instance"] == NetworkCostsInstanceLabel
+}

+ 2 - 0
modules/collector-source/pkg/scrape/opencost.go

@@ -1,6 +1,7 @@
 package scrape
 
 import (
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
 	"github.com/opencost/opencost/modules/collector-source/pkg/scrape/target"
 )
@@ -16,6 +17,7 @@ func newOpenCostScraper() Scraper {
 
 func newOpencostTargetScraper(provider target.TargetProvider) *TargetScraper {
 	return newTargetScrapper(
+		event.OpenCostScraperName,
 		provider,
 		[]string{
 			metric.KubecostClusterManagementCost,

+ 1 - 0
modules/collector-source/pkg/scrape/scraper.go

@@ -5,6 +5,7 @@ import (
 )
 
 type Scraper interface {
+	// Scrape performs the metrics scrape and returns a slice of `Update` instances to apply.
 	Scrape() []metric.Update
 }
 

+ 23 - 0
modules/collector-source/pkg/scrape/statsummary.go

@@ -1,9 +1,11 @@
 package scrape
 
 import (
+	"github.com/kubecost/events"
 	"github.com/opencost/opencost/core/pkg/log"
 	"github.com/opencost/opencost/core/pkg/nodestats"
 	"github.com/opencost/opencost/core/pkg/source"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
 	stats "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
 )
@@ -21,7 +23,21 @@ func newStatSummaryScraper(client nodestats.StatSummaryClient) Scraper {
 func (s *StatSummaryScraper) Scrape() []metric.Update {
 	var scrapeResults []metric.Update
 	nodeStats, err := s.client.GetNodeData()
+
 	if err != nil {
+		var errs []error
+		if multiErr, ok := err.(interface{ Unwrap() []error }); ok {
+			errs = multiErr.Unwrap()
+		} else {
+			errs = []error{err}
+		}
+
+		events.Dispatch(event.ScrapeEvent{
+			ScraperName: event.NodeStatsScraperName,
+			Targets:     len(nodeStats) + len(errs),
+			Errors:      errs,
+		})
+
 		log.Errorf("error retrieving node stat data: %s", err.Error())
 		return scrapeResults
 	}
@@ -135,6 +151,13 @@ func (s *StatSummaryScraper) Scrape() []metric.Update {
 			}
 		}
 	}
+
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: event.NodeStatsScraperName,
+		Targets:     len(nodeStats),
+		Errors:      []error{},
+	})
+
 	return scrapeResults
 }
 

+ 29 - 2
modules/collector-source/pkg/scrape/targetscraper.go

@@ -1,24 +1,30 @@
 package scrape
 
 import (
+	"sync"
+
+	"github.com/kubecost/events"
 	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/modules/collector-source/pkg/event"
 	"github.com/opencost/opencost/modules/collector-source/pkg/metric"
 	"github.com/opencost/opencost/modules/collector-source/pkg/scrape/parser"
 	"github.com/opencost/opencost/modules/collector-source/pkg/scrape/target"
 )
 
 type TargetScraper struct {
+	name           string // identifier for the scraper
 	targetProvider target.TargetProvider
 	metricNames    map[string]struct{} // filter for which metrics will be processed
 	includeMetrics bool                // toggle to make metrics an include or exclude list
 }
 
-func newTargetScrapper(provider target.TargetProvider, metricNames []string, includeMetrics bool) *TargetScraper {
+func newTargetScrapper(name string, provider target.TargetProvider, metricNames []string, includeMetrics bool) *TargetScraper {
 	metricSet := make(map[string]struct{})
 	for _, metricName := range metricNames {
 		metricSet[metricName] = struct{}{}
 	}
 	return &TargetScraper{
+		name:           name,
 		targetProvider: provider,
 		metricNames:    metricSet,
 		includeMetrics: includeMetrics,
@@ -27,6 +33,10 @@ func newTargetScrapper(provider target.TargetProvider, metricNames []string, inc
 
 func (s *TargetScraper) Scrape() []metric.Update {
 	targets := s.targetProvider.GetTargets()
+
+	var errLock sync.Mutex
+	var errors []error
+
 	var scrapeFuncs []ScrapeFunc
 	for i := range targets {
 		target := targets[i]
@@ -34,11 +44,19 @@ func (s *TargetScraper) Scrape() []metric.Update {
 			var scrapeResults []metric.Update
 			f, err := target.Load()
 			if err != nil {
+				errLock.Lock()
+				errors = append(errors, err)
+				errLock.Unlock()
+
 				log.Errorf("failed to scrape target: %s", err.Error())
 				return scrapeResults
 			}
 			results, err := parser.Parse(f)
 			if err != nil {
+				errLock.Lock()
+				errors = append(errors, err)
+				errLock.Unlock()
+
 				log.Errorf("failed to parse target: %s", err.Error())
 				return scrapeResults
 			}
@@ -58,5 +76,14 @@ func (s *TargetScraper) Scrape() []metric.Update {
 		scrapeFuncs = append(scrapeFuncs, fn)
 	}
 
-	return concurrentScrape(scrapeFuncs...)
+	updates := concurrentScrape(scrapeFuncs...)
+
+	// dispatch a scrape event for this specific scrape
+	events.Dispatch(event.ScrapeEvent{
+		ScraperName: s.name,
+		Targets:     len(targets),
+		Errors:      errors,
+	})
+
+	return updates
 }

+ 32 - 28
modules/prometheus-source/pkg/prom/metricsquerier.go

@@ -98,7 +98,7 @@ func (pds *PrometheusMetricsQuerier) QueryPVCInfo(start, end time.Time) *source.
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -117,7 +117,7 @@ func (pds *PrometheusMetricsQuerier) QueryPVActiveMinutes(start, end time.Time)
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -136,7 +136,7 @@ func (pds *PrometheusMetricsQuerier) QueryLocalStorageCost(start, end time.Time)
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -161,7 +161,7 @@ func (pds *PrometheusMetricsQuerier) QueryLocalStorageUsedCost(start, end time.T
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -222,7 +222,7 @@ func (pds *PrometheusMetricsQuerier) QueryLocalStorageBytes(start, end time.Time
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -241,7 +241,7 @@ func (pds *PrometheusMetricsQuerier) QueryLocalStorageActiveMinutes(start, end t
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -369,7 +369,7 @@ func (pds *PrometheusMetricsQuerier) QueryNodeActiveMinutes(start, end time.Time
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -388,7 +388,7 @@ func (pds *PrometheusMetricsQuerier) QueryNodeCPUModeTotal(start, end time.Time)
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic("failed to parse duration string passed to QueryNodeCPUModeTotal")
 	}
@@ -406,7 +406,7 @@ func (pds *PrometheusMetricsQuerier) QueryNodeRAMSystemPercent(start, end time.T
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -425,7 +425,7 @@ func (pds *PrometheusMetricsQuerier) QueryNodeRAMUserPercent(start, end time.Tim
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -462,7 +462,7 @@ func (pds *PrometheusMetricsQuerier) QueryLBActiveMinutes(start, end time.Time)
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -481,7 +481,7 @@ func (pds *PrometheusMetricsQuerier) QueryClusterManagementDuration(start, end t
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -520,7 +520,7 @@ func (pds *PrometheusMetricsQuerier) QueryPods(start, end time.Time) *source.Fut
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -539,7 +539,7 @@ func (pds *PrometheusMetricsQuerier) QueryPodsUID(start, end time.Time) *source.
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -725,7 +725,7 @@ func (pds *PrometheusMetricsQuerier) QueryCPUUsageMax(start, end time.Time) *sou
 
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr = pds.durationStringFor(start, end, minsPerResolution)
+	durStr = pds.durationStringFor(start, end, minsPerResolution, false)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -995,7 +995,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetZoneGiB(start, end time.Time) *sour
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1032,7 +1032,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetRegionGiB(start, end time.Time) *so
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1069,7 +1069,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetInternetGiB(start, end time.Time) *
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1106,7 +1106,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetInternetServiceGiB(start, end time.
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1125,7 +1125,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetTransferBytes(start, end time.Time)
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1144,7 +1144,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetZoneIngressGiB(start, end time.Time
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1163,7 +1163,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetRegionIngressGiB(start, end time.Ti
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1182,7 +1182,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetInternetIngressGiB(start, end time.
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1201,7 +1201,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetInternetServiceIngressGiB(start, en
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1220,7 +1220,7 @@ func (pds *PrometheusMetricsQuerier) QueryNetReceiveBytes(start, end time.Time)
 	cfg := pds.promConfig
 	minsPerResolution := cfg.DataResolutionMinutes
 
-	durStr := pds.durationStringFor(start, end, minsPerResolution)
+	durStr := pds.durationStringFor(start, end, minsPerResolution, true)
 	if durStr == "" {
 		panic(fmt.Sprintf("failed to parse duration string passed to %s", queryName))
 	}
@@ -1461,7 +1461,7 @@ func (pds *PrometheusMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time
 	end := time.Now().UTC().Truncate(timeutil.Day).Add(timeutil.Day)
 	start := end.Add(-dur)
 
-	durStr := pds.durationStringFor(start, end, minutesPerDuration)
+	durStr := pds.durationStringFor(start, end, minutesPerDuration, false)
 
 	ctx := pds.promContexts.NewNamedContext(AllocationContextName)
 	queryOldest := fmt.Sprintf(queryFmtOldestSample, cfg.ClusterFilter, durStr, "1h")
@@ -1497,7 +1497,11 @@ func (pds *PrometheusMetricsQuerier) QueryDataCoverage(limitDays int) (time.Time
 	return oldest, newest, nil
 }
 
-func (pds *PrometheusMetricsQuerier) durationStringFor(start, end time.Time, minsPerResolution int) string {
+// durationStringFor simplifies the determination of query duration based on the version of prom and if the function
+// in the query needs all data points in the vector it is provided or if it will extrapolate its own. Functions
+// that extrapolate will add on another resolution if given a duration that is one resolution longer than the intended
+// duration.
+func (pds *PrometheusMetricsQuerier) durationStringFor(start, end time.Time, minsPerResolution int, extrapolated bool) string {
 	dur := end.Sub(start)
 
 	// If using a version of Prometheus where the resolution needs duration offset,
@@ -1509,7 +1513,7 @@ func (pds *PrometheusMetricsQuerier) durationStringFor(start, end time.Time, min
 	// However, OpenCost expects for there to be 13 timestamps where the first
 	// begins at 00:00:00. To achieve this, we must modify our query to
 	// avg(node_total_hourly_cost{}) by (node, provider_id)[65m:5m]
-	if pds.promConfig.IsOffsetResolution {
+	if pds.promConfig.IsOffsetResolution && !extrapolated {
 		// increase the query time by the resolution
 		dur = dur + (time.Duration(minsPerResolution) * time.Minute)
 	}

+ 29 - 0
pkg/cloud/aws/athenaconfiguration.go

@@ -18,6 +18,7 @@ type AthenaConfiguration struct {
 	Workgroup  string     `json:"workgroup"`
 	Account    string     `json:"account"`
 	Authorizer Authorizer `json:"authorizer"`
+	CURVersion string     `json:"curVersion,omitempty"` // "1.0" or "2.0", defaults to "2.0" if not specified
 }
 
 func (ac *AthenaConfiguration) Validate() error {
@@ -53,6 +54,11 @@ func (ac *AthenaConfiguration) Validate() error {
 		return fmt.Errorf("AthenaConfiguration: missing account")
 	}
 
+	// Validate CURVersion if specified
+	if ac.CURVersion != "" && ac.CURVersion != "1.0" && ac.CURVersion != "2.0" {
+		return fmt.Errorf("AthenaConfiguration: invalid CURVersion '%s', must be '1.0' or '2.0'", ac.CURVersion)
+	}
+
 	return nil
 }
 
@@ -103,6 +109,10 @@ func (ac *AthenaConfiguration) Equals(config cloud.Config) bool {
 		return false
 	}
 
+	if ac.CURVersion != thatConfig.CURVersion {
+		return false
+	}
+
 	return true
 }
 
@@ -116,6 +126,7 @@ func (ac *AthenaConfiguration) Sanitize() cloud.Config {
 		Workgroup:  ac.Workgroup,
 		Account:    ac.Account,
 		Authorizer: ac.Authorizer.Sanitize().(Authorizer),
+		CURVersion: ac.CURVersion,
 	}
 }
 
@@ -190,6 +201,18 @@ func (ac *AthenaConfiguration) UnmarshalJSON(b []byte) error {
 	}
 	ac.Authorizer = authorizer
 
+	// Parse CURVersion if present (optional field)
+	if _, ok := fmap["curVersion"]; ok {
+		curVersion, err := cloud.GetInterfaceValue[string](fmap, "curVersion")
+		if err != nil {
+			return fmt.Errorf("AthenaConfiguration: UnmarshalJSON: %w", err)
+		}
+		ac.CURVersion = curVersion
+	} else {
+		// Default to 2.0 if not specified
+		ac.CURVersion = "2.0"
+	}
+
 	return nil
 }
 
@@ -220,6 +243,11 @@ func ConvertAwsAthenaInfoToConfig(aai AwsAthenaInfo) cloud.KeyedConfig {
 
 	var config cloud.KeyedConfig
 	if aai.AthenaTable != "" || aai.AthenaDatabase != "" {
+		// Use CURVersion from config if specified, otherwise default to 2.0
+		curVersion := aai.CURVersion
+		if curVersion == "" {
+			curVersion = "2.0"
+		}
 		config = &AthenaConfiguration{
 			Bucket:     aai.AthenaBucketName,
 			Region:     aai.AthenaRegion,
@@ -229,6 +257,7 @@ func ConvertAwsAthenaInfoToConfig(aai AwsAthenaInfo) cloud.KeyedConfig {
 			Workgroup:  aai.AthenaWorkgroup,
 			Account:    aai.AccountID,
 			Authorizer: authorizer,
+			CURVersion: curVersion,
 		}
 	} else {
 		config = &S3Configuration{

+ 159 - 1
pkg/cloud/aws/athenaconfiguration_test.go

@@ -161,6 +161,62 @@ func TestAthenaConfiguration_Validate(t *testing.T) {
 			},
 			expected: fmt.Errorf("AthenaConfiguration: missing account"),
 		},
+		"valid CUR version 1.0": {
+			config: AthenaConfiguration{
+				Bucket:     "bucket",
+				Region:     "region",
+				Database:   "database",
+				Catalog:    "catalog",
+				Table:      "table",
+				Workgroup:  "workgroup",
+				Account:    "account",
+				Authorizer: &ServiceAccount{},
+				CURVersion: "1.0",
+			},
+			expected: nil,
+		},
+		"valid CUR version 2.0": {
+			config: AthenaConfiguration{
+				Bucket:     "bucket",
+				Region:     "region",
+				Database:   "database",
+				Catalog:    "catalog",
+				Table:      "table",
+				Workgroup:  "workgroup",
+				Account:    "account",
+				Authorizer: &ServiceAccount{},
+				CURVersion: "2.0",
+			},
+			expected: nil,
+		},
+		"valid empty CUR version defaults to 2.0": {
+			config: AthenaConfiguration{
+				Bucket:     "bucket",
+				Region:     "region",
+				Database:   "database",
+				Catalog:    "catalog",
+				Table:      "table",
+				Workgroup:  "workgroup",
+				Account:    "account",
+				Authorizer: &ServiceAccount{},
+				CURVersion: "",
+			},
+			expected: nil,
+		},
+		"invalid CUR version": {
+			config: AthenaConfiguration{
+				Bucket:     "bucket",
+				Region:     "region",
+				Database:   "database",
+				Catalog:    "catalog",
+				Table:      "table",
+				Workgroup:  "workgroup",
+				Account:    "account",
+				Authorizer: &ServiceAccount{},
+				CURVersion: "3.0",
+			},
+			expected: fmt.Errorf("AthenaConfiguration: invalid CURVersion '3.0', must be '1.0' or '2.0'"),
+		},
 	}
 
 	for name, testCase := range testCases {
@@ -515,6 +571,68 @@ func TestAthenaConfiguration_Equals(t *testing.T) {
 			},
 			expected: false,
 		},
+		"different CUR version": {
+			left: AthenaConfiguration{
+				Bucket:    "bucket",
+				Region:    "region",
+				Database:  "database",
+				Catalog:   "catalog",
+				Table:     "table",
+				Workgroup: "workgroup",
+				Account:   "account",
+				Authorizer: &AccessKey{
+					ID:     "id",
+					Secret: "secret",
+				},
+				CURVersion: "1.0",
+			},
+			right: &AthenaConfiguration{
+				Bucket:    "bucket",
+				Region:    "region",
+				Database:  "database",
+				Catalog:   "catalog",
+				Table:     "table",
+				Workgroup: "workgroup",
+				Account:   "account",
+				Authorizer: &AccessKey{
+					ID:     "id",
+					Secret: "secret",
+				},
+				CURVersion: "2.0",
+			},
+			expected: false,
+		},
+		"matching CUR version": {
+			left: AthenaConfiguration{
+				Bucket:    "bucket",
+				Region:    "region",
+				Database:  "database",
+				Catalog:   "catalog",
+				Table:     "table",
+				Workgroup: "workgroup",
+				Account:   "account",
+				Authorizer: &AccessKey{
+					ID:     "id",
+					Secret: "secret",
+				},
+				CURVersion: "1.0",
+			},
+			right: &AthenaConfiguration{
+				Bucket:    "bucket",
+				Region:    "region",
+				Database:  "database",
+				Catalog:   "catalog",
+				Table:     "table",
+				Workgroup: "workgroup",
+				Account:   "account",
+				Authorizer: &AccessKey{
+					ID:     "id",
+					Secret: "secret",
+				},
+				CURVersion: "1.0",
+			},
+			expected: true,
+		},
 		"different config": {
 			left: AthenaConfiguration{
 				Bucket:    "bucket",
@@ -551,7 +669,9 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 		config AthenaConfiguration
 	}{
 		"Empty Config": {
-			config: AthenaConfiguration{},
+			config: AthenaConfiguration{
+				CURVersion: "2.0", // Default value after JSON unmarshal
+			},
 		},
 		"AccessKey": {
 			config: AthenaConfiguration{
@@ -566,6 +686,7 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 					ID:     "id",
 					Secret: "secret",
 				},
+				CURVersion: "2.0", // Default value after JSON unmarshal
 			},
 		},
 
@@ -579,6 +700,7 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 				Workgroup:  "workgroup",
 				Account:    "account",
 				Authorizer: &ServiceAccount{},
+				CURVersion: "2.0", // Default value after JSON unmarshal
 			},
 		},
 		"AssumeRole with AccessKey": {
@@ -597,6 +719,7 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 					},
 					RoleARN: "12345",
 				},
+				CURVersion: "2.0", // Default value after JSON unmarshal
 			},
 		},
 		"AssumeRole with ServiceAccount": {
@@ -612,6 +735,7 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 					Authorizer: &ServiceAccount{},
 					RoleARN:    "12345",
 				},
+				CURVersion: "2.0", // Default value after JSON unmarshal
 			},
 		},
 		"RoleArnNil": {
@@ -627,6 +751,7 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 					Authorizer: nil,
 					RoleARN:    "12345",
 				},
+				CURVersion: "2.0", // Default value after JSON unmarshal
 			},
 		},
 		"AssumeRole with AssumeRole with ServiceAccount": {
@@ -645,6 +770,39 @@ func TestAthenaConfiguration_JSON(t *testing.T) {
 					},
 					RoleARN: "12345",
 				},
+				CURVersion: "2.0", // Default value after JSON unmarshal
+			},
+		},
+		"CUR Version 1.0": {
+			config: AthenaConfiguration{
+				Bucket:    "bucket",
+				Region:    "region",
+				Database:  "database",
+				Catalog:   "catalog",
+				Table:     "table",
+				Workgroup: "workgroup",
+				Account:   "account",
+				Authorizer: &AccessKey{
+					ID:     "id",
+					Secret: "secret",
+				},
+				CURVersion: "1.0",
+			},
+		},
+		"CUR Version 2.0": {
+			config: AthenaConfiguration{
+				Bucket:    "bucket",
+				Region:    "region",
+				Database:  "database",
+				Catalog:   "catalog",
+				Table:     "table",
+				Workgroup: "workgroup",
+				Account:   "account",
+				Authorizer: &AccessKey{
+					ID:     "id",
+					Secret: "secret",
+				},
+				CURVersion: "2.0",
 			},
 		},
 	}

+ 21 - 1
pkg/cloud/aws/athenaintegration.go

@@ -328,8 +328,28 @@ func (ai *AthenaIntegration) GetPartitionWhere(start, end time.Time) string {
 	month := time.Date(start.Year(), start.Month(), 1, 0, 0, 0, 0, time.UTC)
 	endMonth := time.Date(end.Year(), end.Month(), 1, 0, 0, 0, 0, time.UTC)
 	var disjuncts []string
+	
+	// For CUR 2.0, check if billing_period partitions actually exist
+	useBillingPeriodPartitions := false
+	if ai.CURVersion != "1.0" {
+		// Check if billing_period partitions exist in the table
+		if hasBillingPeriod, err := ai.HasBillingPeriodPartitions(); err == nil && hasBillingPeriod {
+			useBillingPeriodPartitions = true
+		}
+	}
+	
 	for !month.After(endMonth) {
-		disjuncts = append(disjuncts, fmt.Sprintf("(year = '%d' AND month = '%d')", month.Year(), month.Month()))
+		if ai.CURVersion == "1.0" {
+			// CUR 1.0 uses year and month columns for partitioning
+			disjuncts = append(disjuncts, fmt.Sprintf("(year = '%d' AND month = '%d')", month.Year(), month.Month()))
+		} else if useBillingPeriodPartitions {
+			// CUR 2.0 with billing_period partitions
+			disjuncts = append(disjuncts, fmt.Sprintf("(billing_period = '%d-%02d')", month.Year(), month.Month()))
+		} else {
+			// CUR 2.0 fallback - use date_format functions (less efficient but works without partitions)
+			disjuncts = append(disjuncts, fmt.Sprintf("(date_format(line_item_usage_start_date, '%%Y') = '%d' AND date_format(line_item_usage_start_date, '%%m') = '%02d')",
+				month.Year(), month.Month()))
+		}
 		month = month.AddDate(0, 1, 0)
 	}
 	str := fmt.Sprintf("(%s)", strings.Join(disjuncts, " OR "))

+ 232 - 0
pkg/cloud/aws/athenaintegration_test.go

@@ -1,8 +1,10 @@
 package aws
 
 import (
+	"fmt"
 	"os"
 	"reflect"
+	"strings"
 	"testing"
 	"time"
 
@@ -396,3 +398,233 @@ func stringsToRow(strings []string) types.Row {
 	}
 	return types.Row{Data: data}
 }
+
// mockAthenaQuerier is a mock that overrides HasBillingPeriodPartitions for testing.
// It embeds the real AthenaQuerier so the rest of its surface is unchanged.
type mockAthenaQuerier struct {
	AthenaQuerier
	// hasBillingPeriodPartitions is the canned result returned by the override below.
	hasBillingPeriodPartitions bool
}

// HasBillingPeriodPartitions returns the canned value instead of querying Athena.
func (m *mockAthenaQuerier) HasBillingPeriodPartitions() (bool, error) {
	return m.hasBillingPeriodPartitions, nil
}
+
+// mockAthenaIntegration is a mock that uses mockAthenaQuerier
+type mockAthenaIntegration struct {
+	*mockAthenaQuerier
+}
+
+func (m *mockAthenaIntegration) GetPartitionWhere(start, end time.Time) string {
+	// The partition logic using our mock's HasBillingPeriodPartitions result
+	month := time.Date(start.Year(), start.Month(), 1, 0, 0, 0, 0, time.UTC)
+	endMonth := time.Date(end.Year(), end.Month(), 1, 0, 0, 0, 0, time.UTC)
+	var disjuncts []string
+	
+	// Using our mock's result for billing period partitions
+	useBillingPeriodPartitions := false
+	if m.mockAthenaQuerier.AthenaConfiguration.CURVersion != "1.0" {
+		useBillingPeriodPartitions = m.mockAthenaQuerier.hasBillingPeriodPartitions
+	}
+	
+	for !month.After(endMonth) {
+		if m.mockAthenaQuerier.AthenaConfiguration.CURVersion == "1.0" {
+			// CUR 1.0 uses year and month columns for partitioning
+			disjuncts = append(disjuncts, fmt.Sprintf("(year = '%d' AND month = '%d')", month.Year(), month.Month()))
+		} else if useBillingPeriodPartitions {
+			// CUR 2.0 with billing_period partitions
+			disjuncts = append(disjuncts, fmt.Sprintf("(billing_period = '%d-%02d')", month.Year(), month.Month()))
+		} else {
+			// CUR 2.0 fallback - use date_format functions
+			disjuncts = append(disjuncts, fmt.Sprintf("(date_format(line_item_usage_start_date, '%%Y') = '%d' AND date_format(line_item_usage_start_date, '%%m') = '%02d')",
+				month.Year(), month.Month()))
+		}
+		month = month.AddDate(0, 1, 0)
+	}
+	return fmt.Sprintf("(%s)", strings.Join(disjuncts, " OR "))
+}
+
// TestAthenaIntegration_GetPartitionWhere verifies partition WHERE-clause
// generation for CUR 1.0 (year/month columns), CUR 2.0 (billing_period
// partitions), and the CUR 2.0 fallback (date_format on
// line_item_usage_start_date), including multi-month and year-boundary spans
// and the default behavior when CURVersion is empty (treated as 2.0).
func TestAthenaIntegration_GetPartitionWhere(t *testing.T) {
	testCases := map[string]struct {
		integration interface{ GetPartitionWhere(time.Time, time.Time) string }
		start       time.Time
		end         time.Time
		expected    string
	}{
		"CUR 1.0 single month": {
			integration: &AthenaIntegration{
				AthenaQuerier: AthenaQuerier{
					AthenaConfiguration: AthenaConfiguration{
						Bucket:     "bucket",
						Region:     "region",
						Database:   "database",
						Table:      "table",
						Workgroup:  "workgroup",
						Account:    "account",
						Authorizer: &ServiceAccount{},
						CURVersion: "1.0",
					},
				},
			},
			start:    time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 1, 25, 0, 0, 0, 0, time.UTC),
			expected: "((year = '2024' AND month = '1'))",
		},
		"CUR 2.0 single month": {
			integration: &mockAthenaIntegration{
				mockAthenaQuerier: &mockAthenaQuerier{
					AthenaQuerier: AthenaQuerier{
						AthenaConfiguration: AthenaConfiguration{
							Bucket:     "bucket",
							Region:     "region",
							Database:   "database",
							Table:      "table",
							Workgroup:  "workgroup",
							Account:    "account",
							Authorizer: &ServiceAccount{},
							CURVersion: "2.0",
						},
					},
					hasBillingPeriodPartitions: true,
				},
			},
			start:    time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 1, 25, 0, 0, 0, 0, time.UTC),
			expected: "((billing_period = '2024-01'))",
		},
		"CUR 1.0 multiple months": {
			integration: &AthenaIntegration{
				AthenaQuerier: AthenaQuerier{
					AthenaConfiguration: AthenaConfiguration{
						Bucket:     "bucket",
						Region:     "region",
						Database:   "database",
						Table:      "table",
						Workgroup:  "workgroup",
						Account:    "account",
						Authorizer: &ServiceAccount{},
						CURVersion: "1.0",
					},
				},
			},
			start:    time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 3, 10, 0, 0, 0, 0, time.UTC),
			expected: "((year = '2024' AND month = '1') OR (year = '2024' AND month = '2') OR (year = '2024' AND month = '3'))",
		},
		"CUR 2.0 multiple months": {
			integration: &mockAthenaIntegration{
				mockAthenaQuerier: &mockAthenaQuerier{
					AthenaQuerier: AthenaQuerier{
						AthenaConfiguration: AthenaConfiguration{
							Bucket:     "bucket",
							Region:     "region",
							Database:   "database",
							Table:      "table",
							Workgroup:  "workgroup",
							Account:    "account",
							Authorizer: &ServiceAccount{},
							CURVersion: "2.0",
						},
					},
					hasBillingPeriodPartitions: true,
				},
			},
			start:    time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 3, 10, 0, 0, 0, 0, time.UTC),
			expected: "((billing_period = '2024-01') OR (billing_period = '2024-02') OR (billing_period = '2024-03'))",
		},
		"CUR 2.0 across year boundary": {
			integration: &mockAthenaIntegration{
				mockAthenaQuerier: &mockAthenaQuerier{
					AthenaQuerier: AthenaQuerier{
						AthenaConfiguration: AthenaConfiguration{
							Bucket:     "bucket",
							Region:     "region",
							Database:   "database",
							Table:      "table",
							Workgroup:  "workgroup",
							Account:    "account",
							Authorizer: &ServiceAccount{},
							CURVersion: "2.0",
						},
					},
					hasBillingPeriodPartitions: true,
				},
			},
			start:    time.Date(2023, 12, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 2, 10, 0, 0, 0, 0, time.UTC),
			expected: "((billing_period = '2023-12') OR (billing_period = '2024-01') OR (billing_period = '2024-02'))",
		},
		"CUR 1.0 across year boundary": {
			integration: &AthenaIntegration{
				AthenaQuerier: AthenaQuerier{
					AthenaConfiguration: AthenaConfiguration{
						Bucket:     "bucket",
						Region:     "region",
						Database:   "database",
						Table:      "table",
						Workgroup:  "workgroup",
						Account:    "account",
						Authorizer: &ServiceAccount{},
						CURVersion: "1.0",
					},
				},
			},
			start:    time.Date(2023, 12, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 2, 10, 0, 0, 0, 0, time.UTC),
			expected: "((year = '2023' AND month = '12') OR (year = '2024' AND month = '1') OR (year = '2024' AND month = '2'))",
		},
		"Default CUR version (empty string defaults to 2.0)": {
			integration: &mockAthenaIntegration{
				mockAthenaQuerier: &mockAthenaQuerier{
					AthenaQuerier: AthenaQuerier{
						AthenaConfiguration: AthenaConfiguration{
							Bucket:     "bucket",
							Region:     "region",
							Database:   "database",
							Table:      "table",
							Workgroup:  "workgroup",
							Account:    "account",
							Authorizer: &ServiceAccount{},
							CURVersion: "",
						},
					},
					hasBillingPeriodPartitions: true,
				},
			},
			start:    time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 1, 25, 0, 0, 0, 0, time.UTC),
			expected: "((billing_period = '2024-01'))",
		},
		"CUR 2.0 fallback when no billing_period partitions": {
			integration: &mockAthenaIntegration{
				mockAthenaQuerier: &mockAthenaQuerier{
					AthenaQuerier: AthenaQuerier{
						AthenaConfiguration: AthenaConfiguration{
							Bucket:     "bucket",
							Region:     "region",
							Database:   "database",
							Table:      "table",
							Workgroup:  "workgroup",
							Account:    "account",
							Authorizer: &ServiceAccount{},
							CURVersion: "2.0",
						},
					},
					hasBillingPeriodPartitions: false, // No billing_period partitions
				},
			},
			start:    time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC),
			end:      time.Date(2024, 1, 25, 0, 0, 0, 0, time.UTC),
			expected: "((date_format(line_item_usage_start_date, '%Y') = '2024' AND date_format(line_item_usage_start_date, '%m') = '01'))",
		},
	}

	for name, testCase := range testCases {
		t.Run(name, func(t *testing.T) {
			actual := testCase.integration.GetPartitionWhere(testCase.start, testCase.end)
			if actual != testCase.expected {
				t.Errorf("GetPartitionWhere() mismatch:\nActual:   %s\nExpected: %s", actual, testCase.expected)
			}
		})
	}
}

+ 26 - 0
pkg/cloud/aws/athenaquerier.go

@@ -63,6 +63,32 @@ func (aq *AthenaQuerier) GetColumns() (map[string]bool, error) {
 	return columnSet, nil
 }
 
+// HasBillingPeriodPartitions checks if the table uses billing_period partitioning
+// by querying SHOW PARTITIONS and looking for billing_period partition keys
+func (aq *AthenaQuerier) HasBillingPeriodPartitions() (bool, error) {
+	// Use SHOW PARTITIONS to check if billing_period partitions exist
+	query := fmt.Sprintf("SHOW PARTITIONS \"%s\"", aq.Table)
+	hasBillingPeriodPartition := false
+	
+	athenaErr := aq.Query(context.TODO(), query, GetAthenaQueryFunc(func(row types.Row) {
+		if len(row.Data) > 0 && row.Data[0].VarCharValue != nil {
+			partitionValue := *row.Data[0].VarCharValue
+			// Check if partition follows billing_period=YYYY-MM format
+			if strings.HasPrefix(partitionValue, "billing_period=") {
+				hasBillingPeriodPartition = true
+			}
+		}
+	}))
+
+	if athenaErr != nil {
+		// If SHOW PARTITIONS fails, assume no billing_period partitions
+		log.Debugf("AthenaQuerier[%s]: SHOW PARTITIONS failed: %s", aq.Key(), athenaErr.Error())
+		return false, athenaErr
+	}
+
+	return hasBillingPeriodPartition, nil
+}
+
 func (aq *AthenaQuerier) Query(ctx context.Context, query string, fn func(*athena.GetQueryResultsOutput) bool) error {
 	err := aq.Validate()
 	if err != nil {

+ 5 - 0
pkg/cloud/aws/provider.go

@@ -386,6 +386,7 @@ type AwsAthenaInfo struct {
 	ServiceKeySecret string `json:"serviceKeySecret"`
 	AccountID        string `json:"projectID"`
 	MasterPayerARN   string `json:"masterPayerARN"`
+	CURVersion       string `json:"curVersion"` // "1.0" or "2.0", defaults to "2.0" if not specified
 }
 
 // IsEmpty returns true if all fields in config are empty, false if not.
@@ -501,6 +502,7 @@ func (aws *AWS) GetAWSAthenaInfo() (*AwsAthenaInfo, error) {
 		ServiceKeySecret: aak.SecretAccessKey,
 		AccountID:        config.AthenaProjectID,
 		MasterPayerARN:   config.MasterPayerARN,
+		CURVersion:       config.AthenaCURVersion,
 	}, nil
 }
 
@@ -561,6 +563,9 @@ func (aws *AWS) UpdateConfig(r io.Reader, updateType string) (*models.CustomPric
 				c.MasterPayerARN = aai.MasterPayerARN
 			}
 			c.AthenaProjectID = aai.AccountID
+			if aai.CURVersion != "" {
+				c.AthenaCURVersion = aai.CURVersion
+			}
 		} else {
 			a := make(map[string]interface{})
 			err := json.NewDecoder(r).Decode(&a)

+ 2 - 0
pkg/cloud/config/configurations_test.go

@@ -116,6 +116,7 @@ var (
 						ID:     "id",
 						Secret: "secret",
 					},
+					CURVersion: "2.0",
 				},
 			},
 		},
@@ -149,6 +150,7 @@ var (
 						Authorizer: &aws.ServiceAccount{},
 						RoleARN:    "roleArn",
 					},
+					CURVersion: "2.0",
 				},
 			},
 		},

+ 3 - 0
pkg/cloud/config/controller_test.go

@@ -22,6 +22,7 @@ var validAthenaConf = &aws.AthenaConfiguration{
 	Workgroup:  "workgroup",
 	Account:    "account",
 	Authorizer: &aws.ServiceAccount{},
+	CURVersion: "2.0",
 }
 
 // Config with the same key as the baseline but is not equal to it because of the change in the non-keyed property Workgroup
@@ -33,6 +34,7 @@ var validAthenaConfModifiedProperty = &aws.AthenaConfiguration{
 	Workgroup:  "workgroup1",
 	Account:    "account",
 	Authorizer: &aws.ServiceAccount{},
+	CURVersion: "2.0",
 }
 
 // Config with the same key as baseline but is invalid due to missing Authorizer
@@ -44,6 +46,7 @@ var invalidAthenaConf = &aws.AthenaConfiguration{
 	Workgroup:  "workgroup",
 	Account:    "account",
 	Authorizer: nil,
+	CURVersion: "2.0",
 }
 
 // A valid config with a different key from the baseline

+ 943 - 0
pkg/cloud/digitalocean/provider.go

@@ -0,0 +1,943 @@
+package digitalocean
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"math"
+	"net/http"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/opencost/opencost/core/pkg/log"
+	"github.com/opencost/opencost/pkg/cloud/models"
+	"github.com/opencost/opencost/pkg/env"
+)
+
// fallbackPVHourlyRate is the assumed block-storage price (USD per GiB-hour)
// used when the pricing catalog is unavailable.
const fallbackPVHourlyRate = 0.00015

// DOKS is the DigitalOcean Kubernetes pricing provider. It serves node, PV,
// load-balancer, and network prices from DigitalOcean's public pricing API,
// cached in memory for an hour at a time.
type DOKS struct {
	PricingURL            string                 // pricing API endpoint; env default when empty
	Cache                 *PricingCache          // hourly cache of the raw API response
	Products              map[string][]DOProduct // catalog products indexed by ItemType
	Config                models.ProviderConfig
	Clientset             clustercache.ClusterCache
	ClusterManagementCost float64
}

// PricingCache holds the last fetched pricing response and its fetch time.
type PricingCache struct {
	data       *DOResponse
	lastUpdate time.Time
	mu         sync.Mutex // guards data and lastUpdate
}

// DOResponse models the top-level pricing API payload.
type DOResponse struct {
	Products []DOProduct `json:"products"`
}

// DOProduct is a single sellable item in the DigitalOcean pricing catalog.
type DOProduct struct {
	SKU         string        `json:"sku"`
	ItemType    string        `json:"itemType"`
	DisplayName string        `json:"displayName"`
	Category    string        `json:"category"`
	Prices      []DOPrice     `json:"prices"`
	Allowances  []DOAllowance `json:"allowances,omitempty"`
	Attributes  []DOAttribute `json:"attributes,omitempty"`
	EffectiveAt string        `json:"effectiveAt"`
}

// DOPrice is one price point for a product; Rate is a decimal string in
// Currency per Unit.
type DOPrice struct {
	Unit      string `json:"unit"`
	Rate      string `json:"rate"`
	MinAmount string `json:"minAmount"`
	MaxAmount string `json:"maxAmount"`
	MinUsage  string `json:"minUsage"`
	MaxUsage  string `json:"maxUsage"`
	Currency  string `json:"currency"`
	Region    string `json:"region"`
}

// DOAllowance describes usage included with a product.
type DOAllowance struct {
	Quantity    string `json:"quantity"`
	Unit        string `json:"unit"`
	AllowanceId string `json:"allowanceId"`
	Schedule    string `json:"schedule"`
}

// DOAttribute is an arbitrary name/value annotation on a product.
type DOAttribute struct {
	Name  string `json:"name"`
	Value string `json:"value"`
	Unit  string `json:"unit"`
}
+
+func NewDOKSProvider(pricingURL string) *DOKS {
+	return &DOKS{
+		PricingURL: pricingURL,
+		Cache:      &PricingCache{},
+		Products:   make(map[string][]DOProduct),
+	}
+}
+
+func NewPricingCache() *PricingCache {
+	return &PricingCache{
+		data:       nil,
+		lastUpdate: time.Time{},
+	}
+}
+
+func (do *DOKS) fetchPricingData() (*DOResponse, error) {
+	do.Cache.mu.Lock()
+	defer do.Cache.mu.Unlock()
+
+	// Return cached data if still valid
+	if do.Cache.data != nil && time.Since(do.Cache.lastUpdate) < time.Hour {
+		log.Debugf("Using cached pricing data (last updated: %v)", do.Cache.lastUpdate)
+		return do.Cache.data, nil
+	}
+
+	pricingURL := do.PricingURL
+	if pricingURL == "" {
+		pricingURL = env.GetDOKSPricingURL()
+	}
+	log.Infof("Fetching DigitalOcean pricing from: %s", pricingURL)
+
+	resp, err := http.Get(pricingURL)
+	if err != nil {
+		log.Warnf("Failed to fetch pricing from DigitalOcean: %v", err)
+		return nil, fmt.Errorf("pricing API fetch error: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		log.Warnf("Pricing API returned unexpected status: %d", resp.StatusCode)
+		return nil, fmt.Errorf("pricing API returned status: %d", resp.StatusCode)
+	}
+
+	var data DOResponse
+	if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
+		log.Errorf("Failed to decode pricing JSON: %v", err)
+		return nil, fmt.Errorf("failed to decode pricing response: %w", err)
+	}
+
+	// Categorize products by item type
+	categorized := make(map[string][]DOProduct)
+	for _, product := range data.Products {
+		log.Debugf("Indexing product: SKU=%s, ItemType=%s, Name=%s", product.SKU, product.ItemType, product.DisplayName)
+		categorized[product.ItemType] = append(categorized[product.ItemType], product)
+	}
+
+	// Cache and return
+	do.Products = categorized
+	do.Cache.data = &data
+	do.Cache.lastUpdate = time.Now()
+
+	log.Infof("Successfully updated DigitalOcean pricing cache (%d products)", len(data.Products))
+	return do.Cache.data, nil
+}
+
// DO Node
// doksKey identifies a node for pricing lookups via its Kubernetes labels
// and cloud provider ID.
type doksKey struct {
	Labels     map[string]string
	ProviderID string
}

// GetKey builds a pricing key for node n from its labels and capacity.
//
// NOTE(review): this mutates the caller's labels map — it adds providerID and
// synthesized node.opencost.io/cpu and node.opencost.io/ram labels derived
// from the node's capacity. Confirm callers expect that side effect.
func (do *DOKS) GetKey(labels map[string]string, n *clustercache.Node) models.Key {
	var providerID string
	if n != nil {
		providerID = n.SpecProviderID
		if providerID != "" {
			labels["providerID"] = providerID
		}

		// Whole cores (truncated) from the node's CPU capacity.
		cpuQty := n.Status.Capacity["cpu"]
		cpuCores := cpuQty.MilliValue() / 1000
		labels["node.opencost.io/cpu"] = fmt.Sprintf("%d", cpuCores)
		log.Debugf("Set label 'node.opencost.io/cpu' = %d", cpuCores)

		// Memory capacity rounded up to whole GiB.
		memQty := n.Status.Capacity["memory"]
		memGiB := int(math.Ceil(float64(memQty.Value()) / (1024 * 1024 * 1024)))
		labels["node.opencost.io/ram"] = fmt.Sprintf("%d", memGiB)
		log.Debugf("Set label 'node.opencost.io/ram' = %d", memGiB)

	}

	return &doksKey{
		Labels:     labels,
		ProviderID: providerID,
	}
}
+
+func (k *doksKey) ID() string {
+	if it, ok := k.Labels["node.kubernetes.io/instance-type"]; ok {
+		return it
+	}
+	if it, ok := k.Labels["beta.kubernetes.io/instance-type"]; ok {
+		return it
+	}
+	log.Debugf("doksKey: missing instance-type. Labels: %+v", k.Labels)
+	return ""
+}
+
+func (k *doksKey) Features() string {
+	features := map[string]string{}
+
+	for _, label := range []string{
+		"node.kubernetes.io/instance-type",
+		"beta.kubernetes.io/instance-type",
+		"kubernetes.io/arch",
+		"beta.kubernetes.io/arch",
+		"node.opencost.io/ram",
+		"node.opencost.io/cpu",
+	} {
+		if val, ok := k.Labels[label]; ok {
+			features[label] = val
+		}
+	}
+
+	var parts []string
+	for k, v := range features {
+		parts = append(parts, fmt.Sprintf("%s=%s", k, v))
+	}
+
+	sort.Strings(parts)
+	return strings.Join(parts, ",")
+}
+
// GPUType returns "" — GPU typing is not implemented for DOKS keys.
func (k *doksKey) GPUType() string {
	return ""
}

// String returns the instance-type slug (GA label preferred, beta fallback),
// or "" when neither label is present.
func (k *doksKey) String() string {
	if instanceType, ok := k.Labels["node.kubernetes.io/instance-type"]; ok {
		return instanceType
	}
	if instanceType, ok := k.Labels["beta.kubernetes.io/instance-type"]; ok {
		return instanceType
	}
	return ""
}

// GPUCount returns 0 — GPU counting is not implemented for DOKS keys.
func (k *doksKey) GPUCount() int {
	return 0
}
+
// SlugBase describes a droplet family's seeded base size and cost.
// NOTE(review): not referenced in this file — confirm external use before
// removing.
type SlugBase struct {
	BaseSlug   string
	BaseCost   float64
	BaseVCPU   int
	BaseRAMGiB int
}

// slugSeeds holds per-family pricing anchors used to scale slug estimates.
type slugSeeds struct {
	BaseVCPU    int     // vCPU count of the seeded base size
	BaseHourly  float64 // hourly USD price of the seeded base size
	RamPerVCPU  int     // GiB of RAM per vCPU for the family
	IntelHourly float64 // hourly USD price of the Intel-variant base size (0 when none)
}

// slugFamilySeed anchors each droplet family prefix to a known base size and
// price from which other sizes are scaled linearly by vCPU count.
var slugFamilySeed = map[string]slugSeeds{
	"c":     {BaseVCPU: 4, BaseHourly: 0.12500, RamPerVCPU: 2, IntelHourly: 0.16220},
	"c2":    {BaseVCPU: 4, BaseHourly: 0.13988, RamPerVCPU: 2, IntelHourly: 0.18155},
	"g":     {BaseVCPU: 4, BaseHourly: 0.18750, RamPerVCPU: 4, IntelHourly: 0.22470},
	"gd":    {BaseVCPU: 4, BaseHourly: 0.20238, RamPerVCPU: 4, IntelHourly: 0.23512},
	"m":     {BaseVCPU: 8, BaseHourly: 0.50000, RamPerVCPU: 8, IntelHourly: 0.58929},
	"m3":    {BaseVCPU: 8, BaseHourly: 0.61905, RamPerVCPU: 8, IntelHourly: 0.65476},
	"m6":    {BaseVCPU: 8, BaseHourly: 0.77976, RamPerVCPU: 8, IntelHourly: 0},
	"s":     {BaseVCPU: 4, BaseHourly: 0.07143, RamPerVCPU: 2, IntelHourly: 0.08333},
	"so":    {BaseVCPU: 8, BaseHourly: 0.77976, RamPerVCPU: 8, IntelHourly: 0.77976},
	"so1_5": {BaseVCPU: 8, BaseHourly: 0.97024, RamPerVCPU: 8, IntelHourly: 0.82738},
}

// TODO Refine GPU pricing and move to GPU method once GPUs are fully GA
// gpuHourly maps GPU droplet slugs to fixed hourly USD prices.
var gpuHourly = map[string]float64{
	"gpu-4000adax1-20gb": 0.76,
	"gpu-6000adax1-48gb": 1.57,
	"gpu-h100x1-80gb":    3.39,
	"gpu-h100x8-640gb":   23.92,
	"gpu-h200x1-141gb":   3.44,
	"gpu-h200x8-1128gb":  27.52,
	"gpu-l40sx1-48gb":    1.57,
	"gpu-mi300x1-192gb":  1.99,
	"gpu-mi300x8-1536gb": 15.92,
}

// Slug-parsing helpers: "<n>vcpu" segments, "<n>gb" segments, and a bare
// leading count such as "c-2".
var (
	reVCpu        = regexp.MustCompile(`(\d+)\s*vcpu`)
	reRAM         = regexp.MustCompile(`(\d+)\s*gb`)
	reSimpleCount = regexp.MustCompile(`^[a-z0-9_]+-(\d+)(?:-|$)`)
)
+
// extractResources parses a DigitalOcean size slug (e.g. "s-2vcpu-4gb") and
// returns (vcpu, ramGiB, ok). Bare numeric segments are interpreted
// positionally — vCPU first, then RAM — and when only a vCPU count is found,
// RAM defaults to twice the vCPU count (which holds for all c families).
func extractResources(slug string) (int, int, bool) {
	var vcpu, ram int
	var haveCPU, haveRAM bool

	for _, segment := range strings.Split(slug, "-") {
		switch {
		case strings.HasSuffix(segment, "vcpu"):
			if n, err := strconv.Atoi(strings.TrimSuffix(segment, "vcpu")); err == nil {
				vcpu, haveCPU = n, true
			}
		case strings.HasSuffix(segment, "gb"):
			if n, err := strconv.Atoi(strings.TrimSuffix(segment, "gb")); err == nil {
				ram, haveRAM = n, true
			}
		default:
			// Plain numbers ("c-2", "m-4-32") fill vCPU first, then RAM.
			if n, err := strconv.Atoi(segment); err == nil {
				if !haveCPU {
					vcpu, haveCPU = n, true
				} else if !haveRAM {
					ram, haveRAM = n, true
				}
			}
		}
	}

	// Only a CPU count found: assume the family's 2 GiB-per-vCPU ratio.
	if haveCPU && !haveRAM {
		ram, haveRAM = 2*vcpu, true
	}

	return vcpu, ram, haveCPU && haveRAM
}
+
+// Estimate cost based on slug pattern and scale from base slugs which are seeded
+func estimateCostFromSlug(slug string) (float64, int, int, bool) {
+	s := strings.ToLower(strings.TrimSpace(slug))
+
+	// GPUs are to be handled as a separate case
+	if strings.HasPrefix(s, "gpu-") {
+		if h, ok := gpuHourly[s]; ok {
+			vcpu, ram := extractVCpuRAMGuess(s, "", 0) // we don’t rely on these for pricing
+			return h, vcpu, ram, true
+		}
+		return 0, 0, 0, false
+	}
+
+	dashPosition := strings.IndexByte(s, '-')
+	if dashPosition <= 0 {
+		return 0, 0, 0, false
+	}
+	family := s[:dashPosition]
+	seed, ok := slugFamilySeed[family]
+	if !ok {
+		return 0, 0, 0, false
+	}
+
+	hasIntel := strings.Contains(s, "-intel")
+
+	vcpu, ramGiB := extractVCpuRAMGuess(s, family, seed.RamPerVCPU)
+	if vcpu == 0 {
+		return 0, 0, 0, false
+	}
+	if ramGiB == 0 && seed.RamPerVCPU > 0 {
+		ramGiB = seed.RamPerVCPU * vcpu
+	}
+	scale := float64(vcpu) / float64(seed.BaseVCPU)
+	hourly := seed.BaseHourly * scale
+
+	if hasIntel && seed.IntelHourly > 0 && seed.BaseHourly > 0 {
+		mult := seed.IntelHourly / seed.BaseHourly
+		hourly *= mult
+	}
+
+	return hourly, vcpu, ramGiB, true
+}
+
+// TODO Fix GPU Pricing after GA
+func extractVCpuRAMGuess(slugLower, family string, ramPerVCPU int) (vcpu int, ramGiB int) {
+	// Regex for matching CPU, we try to find CPU first
+	// If RAM not found, we can multiply VCPU by 2 to find it
+	if m := reVCpu.FindStringSubmatch(slugLower); len(m) == 2 {
+		if n, _ := strconv.Atoi(m[1]); n > 0 {
+			vcpu = n
+		}
+	}
+	if m := reRAM.FindStringSubmatch(slugLower); len(m) == 2 {
+		if n, _ := strconv.Atoi(m[1]); n > 0 {
+			ramGiB = n
+		}
+	}
+	if vcpu == 0 {
+		if m := reSimpleCount.FindStringSubmatch(slugLower); len(m) == 2 {
+			if n, _ := strconv.Atoi(m[1]); n > 0 {
+				vcpu = n
+			}
+		}
+	}
+
+	if ramGiB == 0 && vcpu > 0 && ramPerVCPU > 0 {
+		ramGiB = vcpu * ramPerVCPU
+	}
+	return
+}
+
// vcpuRegex and ramRegex extract "<n> vCPU" and "<n> GB RAM" figures from
// catalog display names, case-insensitively.
var (
	vcpuRegex = regexp.MustCompile(`(?i)(\d+)\s*VCPU`)
	ramRegex  = regexp.MustCompile(`(?i)(\d+)\s*GB\s*RAM`)
)

// extractSpecsFromDisplayName parses vCPU and RAM (GiB) out of a DigitalOcean
// product display name such as "4 vCPU, 8GB RAM". It errors when either
// figure is missing or non-numeric.
func extractSpecsFromDisplayName(name string) (vcpu int, memoryGiB int, err error) {
	cpuMatch := vcpuRegex.FindStringSubmatch(name)
	memMatch := ramRegex.FindStringSubmatch(name)
	if len(cpuMatch) < 2 || len(memMatch) < 2 {
		return 0, 0, fmt.Errorf("could not extract specs from displayName: %q", name)
	}

	if vcpu, err = strconv.Atoi(cpuMatch[1]); err != nil {
		return 0, 0, fmt.Errorf("invalid vCPU format: %v", err)
	}
	if memoryGiB, err = strconv.Atoi(memMatch[1]); err != nil {
		return 0, 0, fmt.Errorf("invalid RAM format: %v", err)
	}
	return vcpu, memoryGiB, nil
}
+
// parseResources reads the node.opencost.io/cpu and node.opencost.io/ram
// values out of a comma-separated "key=value" features string (as produced
// by doksKey.Features). It errors unless both figures are present and
// positive.
func parseResources(features string) (int, int, error) {
	var cpu, ram int
	for _, entry := range strings.Split(features, ",") {
		kv := strings.SplitN(entry, "=", 2)
		if len(kv) != 2 {
			continue
		}
		n, err := strconv.Atoi(kv[1])
		if err != nil {
			continue
		}
		switch kv[0] {
		case "node.opencost.io/cpu":
			cpu = n
		case "node.opencost.io/ram":
			ram = n
		}
	}

	if cpu > 0 && ram > 0 {
		return cpu, ram, nil
	}
	return 0, 0, fmt.Errorf("cpu or ram not found in features")
}
+
// NodePricing resolves an hourly price for the node described by key. It
// determines the node's vCPU/RAM (from labels, then from the size slug),
// matches them against catalog K8S_WORKER_NODE products, and falls back to
// seeded slug estimates when no catalog match exists.
func (do *DOKS) NodePricing(key models.Key) (*models.Node, models.PricingMetadata, error) {
	log.Debugf("Fetching DigitalOcean pricing data (key: %s)", key)

	// Try fetching catalog; fallback is okay
	_, err := do.fetchPricingData()
	if err != nil {
		log.Warnf("Failed to fetch catalog: %v. Will try estimation or fallback.", err)
	}

	arch := parseArch(key.Features())
	slug := key.ID()

	// Try parsing vCPU/RAM from labels
	vcpu, ram, err := parseResources(key.Features())
	if err != nil || vcpu == 0 || ram == 0 {
		log.Infof("Failed to extract CPU/RAM from features. Trying slug: %s", slug)

		var ok bool
		// Try getting from slug (e.g., "s-2vcpu-4gb")
		vcpu, ram, ok = extractResources(slug)
		if !ok {
			// Fallback: RAM = 2x CPU if CPU is known, cases like c-2
			// NOTE(review): extractResources appears to return ok=false only
			// when no vCPU was parsed, which would make this branch
			// unreachable — confirm before relying on it.
			if vcpu > 0 {
				ram = vcpu * 2
				log.Warnf("Only CPU found. Assuming RAM = 2 * CPU → %dGiB", ram)
			} else {
				log.Warnf("Could not extract vCPU/RAM from features or slug. Returning fallback.")
				return fallbackNode(slug)
			}
		}
	}

	// Search for matching product in the DigitalOcean catalog
	for _, products := range do.Products {
		for _, product := range products {
			if product.ItemType != "K8S_WORKER_NODE" {
				continue
			}

			productVCPU, productRAM, err := extractSpecsFromDisplayName(product.DisplayName)
			if err != nil {
				continue
			}

			// Exact vCPU/RAM match wins; the first match found is used.
			if productVCPU == vcpu && productRAM == ram {
				node, meta, err := do.productToNode(product, vcpu, ram, arch)
				if err != nil {
					log.Warnf("Failed to convert product %s to node: %v", product.SKU, err)
					continue
				}
				return node, meta, nil
			}
		}
	}

	log.Warnf("No matching product found for slug %s (vCPU: %d, RAM: %d), falling back", slug, vcpu, ram)
	return fallbackNode(slug)
}
+
// parseArch extracts the node architecture from a comma-separated
// "key=value" features string, honoring both the GA and beta arch labels.
// Returns "" when neither label is present.
func parseArch(features string) string {
	for _, entry := range strings.Split(features, ",") {
		kv := strings.SplitN(entry, "=", 2)
		if len(kv) != 2 {
			continue
		}
		switch kv[0] {
		case "kubernetes.io/arch", "beta.kubernetes.io/arch":
			return kv[1]
		}
	}
	return ""
}
+
+func (do *DOKS) productToNode(product DOProduct, vcpu int, ramGiB int, arch string) (*models.Node, models.PricingMetadata, error) {
+	if len(product.Prices) == 0 {
+		return nil, models.PricingMetadata{
+			Currency: "USD",
+			Source:   "digitalocean",
+			Warnings: []string{"product has no prices"},
+		}, fmt.Errorf("no pricing data for product: %s", product.SKU)
+	}
+
+	price := product.Prices[0]
+	rate, err := strconv.ParseFloat(price.Rate, 64)
+	if err != nil {
+		return nil, models.PricingMetadata{
+			Currency: "USD",
+			Source:   "digitalocean",
+			Warnings: []string{"invalid price rate format"},
+		}, fmt.Errorf("invalid rate for %s: %v", product.SKU, err)
+	}
+
+	var hourlyCost float64
+	switch price.Unit {
+	case "ITEM_PER_SECOND":
+		hourlyCost = rate * 3600
+	case "ITEM_PER_HOUR":
+		hourlyCost = rate
+	default:
+		return nil, models.PricingMetadata{
+			Currency: "USD",
+			Source:   "digitalocean",
+			Warnings: []string{"unsupported pricing unit"},
+		}, fmt.Errorf("unsupported unit: %s", price.Unit)
+	}
+
+	// Assuming CPU and RAM are priced similarly
+	totalUnits := float64(vcpu + ramGiB)
+	vcpuCost := hourlyCost * float64(vcpu) / totalUnits
+	ramCost := hourlyCost * float64(ramGiB) / totalUnits
+
+	if arch == "" {
+		arch = "amd64"
+	}
+
+	return &models.Node{
+			Cost:         fmt.Sprintf("%.5f", hourlyCost),
+			VCPUCost:     fmt.Sprintf("%.5f", vcpuCost),
+			RAMCost:      fmt.Sprintf("%.5f", ramCost),
+			VCPU:         strconv.Itoa(vcpu),
+			RAM:          fmt.Sprintf("%dGiB", ramGiB),
+			InstanceType: product.DisplayName,
+			Region:       price.Region,
+			UsageType:    product.ItemType,
+			PricingType:  models.DefaultPrices,
+			ArchType:     arch,
+		}, models.PricingMetadata{
+			Currency: "USD",
+			Source:   "digitalocean",
+		}, nil
+}
+
+func fallbackNode(slug string) (*models.Node, models.PricingMetadata, error) {
+	if cost, vcpu, ram, ok := estimateCostFromSlug(slug); ok {
+		totalUnits := float64(vcpu + ram)
+		if totalUnits == 0 {
+			return nil, models.PricingMetadata{
+				Currency: "USD",
+				Source:   "static-fallback",
+				Warnings: []string{"invalid vCPU and RAM (0) for fallback"},
+			}, fmt.Errorf("invalid fallback spec: totalUnits=0")
+		}
+
+		unitCost := cost / totalUnits
+
+		log.Infof("FallbackNode (estimated): %s , hourly=%.5f, vcpuUnit=%.5f, ramUnit=%.5f", slug, cost, unitCost, unitCost)
+
+		return &models.Node{
+				Cost:         fmt.Sprintf("%.5f", cost),
+				VCPUCost:     fmt.Sprintf("%.5f", unitCost),
+				RAMCost:      fmt.Sprintf("%.5f", unitCost),
+				VCPU:         strconv.Itoa(vcpu),
+				RAM:          fmt.Sprintf("%dGiB", ram),
+				InstanceType: slug,
+				Region:       "global",
+				UsageType:    "static-fallback",
+				PricingType:  models.DefaultPrices,
+				ArchType:     "amd64",
+			}, models.PricingMetadata{
+				Currency: "USD",
+				Source:   "static-fallback",
+				Warnings: []string{"used estimated fallback"},
+			}, nil
+	}
+
+	return nil, models.PricingMetadata{
+		Currency: "USD",
+		Source:   "none",
+		Warnings: []string{"no fallback available"},
+	}, fmt.Errorf("no fallback pricing for slug: %s", slug)
+}
+
// doksPVKey identifies a persistent volume for pricing lookups.
type doksPVKey struct {
	id           string
	storageClass string
	sizeBytes    int64
	ProviderID   string
	region       string
}

// ID returns the volume's cloud provider ID.
func (k *doksPVKey) ID() string {
	return k.ProviderID
}

// SizeGiB returns the volume size in whole GiB (truncated).
func (k *doksPVKey) SizeGiB() int64 {
	return k.sizeBytes / (1024 * 1024 * 1024)
}

// Features Only one type of PV
func (k *doksPVKey) Features() string {
	return ""
}

// GetStorageClass returns the volume's Kubernetes storage class.
func (k *doksPVKey) GetStorageClass() string {
	return k.storageClass
}
+
// PVPricing returns per-GiB-hour block-storage pricing from the catalog's
// K8S_VOLUME product. It falls back to the static rate when the catalog is
// unavailable, lacks a volume product, carries no prices, uses an unexpected
// unit, or has an unparseable rate.
func (do *DOKS) PVPricing(key models.PVKey) (*models.PV, error) {
	log.Debug("Fetching DigitalOcean block storage pricing")

	_, err := do.fetchPricingData()
	if err != nil {
		log.Warnf("Failed to fetch PV pricing data: %v, using fallback", err)
		return fallbackPV(key)
	}

	products, ok := do.Products["K8S_VOLUME"]
	if !ok || len(products) == 0 {
		log.Warn("No 'K8S_VOLUME' product found in catalog, using fallback")
		return fallbackPV(key)
	}

	// Only the first volume product and its first price are considered.
	product := products[0]
	if len(product.Prices) == 0 {
		log.Warn("No pricing info found for K8S_VOLUME, using fallback")
		return fallbackPV(key)
	}

	price := product.Prices[0]
	if price.Unit != "GIB_PER_HOUR" {
		log.Warnf("Unsupported PV price unit: %s, expected GIB_PER_HOUR. Using fallback.", price.Unit)
		return fallbackPV(key)
	}

	rate, err := strconv.ParseFloat(price.Rate, 64)
	if err != nil {
		log.Warnf("Failed to parse PV rate: %v, using fallback", err)
		return fallbackPV(key)
	}

	// Size is only known when the key is our concrete doksPVKey; otherwise 0.
	k, ok := key.(*doksPVKey)
	var sizeGB int64
	if ok {
		sizeGB = k.SizeGiB()
	}

	return &models.PV{
		Cost:       fmt.Sprintf("%.5f", rate),
		CostPerIO:  "0",
		Class:      key.GetStorageClass(),
		Size:       fmt.Sprintf("%d", sizeGB),
		Region:     price.Region,
		ProviderID: key.ID(),
		Parameters: nil,
	}, nil
}
+
+func fallbackPV(key models.PVKey) (*models.PV, error) {
+	k, ok := key.(*doksPVKey)
+	var sizeGB int64
+	if ok {
+		sizeGB = k.SizeGiB()
+	}
+
+	region := "global"
+	if ok && k.region != "" {
+		region = k.region
+	}
+
+	log.Infof("Using fallback PV pricing: %.5f USD/GiB/hr | Class=%s | SizeGiB=%d | Region=%s | ID=%s",
+		fallbackPVHourlyRate, key.GetStorageClass(), sizeGB, region, key.ID())
+
+	return &models.PV{
+		Cost:       fmt.Sprintf("%.5f", fallbackPVHourlyRate),
+		CostPerIO:  "0",
+		Class:      key.GetStorageClass(),
+		Size:       fmt.Sprintf("%d", sizeGB),
+		Region:     region,
+		ProviderID: key.ID(),
+		Parameters: nil,
+	}, nil
+}
+
+// LoadBalancerPricing returns the hourly cost of a Load Balancer in DigitalOcean (DOKS).
+//
+// DigitalOcean offers multiple Load Balancers with different prices:
+//
+// - Public HTTP Load Balancer:           ~$0.01786/hr
+// - Private Network Load Balancer:      ~$0.02232/hr
+// - Public Network Load Balancer:       ~$0.02232/hr
+// - Statically sized Load Balancers:    $0.01786–$0.10714/hr
+//
+// However, the current OpenCost provider interface does not pass information about
+// individual Load Balancer characteristics (like annotations or network mode).
+//
+// As a result, this implementation uses a fixed average hourly rate of $0.02,
+// which is representative of the most common DO LBs.
+//
+// TODO Once the provider interface supports more granular Load Balancer metadata,
+// this method should be updated to assign costs more precisely.
+func (do *DOKS) LoadBalancerPricing() (*models.LoadBalancer, error) {
+	hourlyCost := 0.02
+	return &models.LoadBalancer{
+		Cost: hourlyCost,
+	}, nil
+}
+
+func (do *DOKS) NetworkPricing() (*models.Network, error) {
+	// fallback
+	const (
+		defaultZoneEgress     = 0.00
+		defaultRegionEgress   = 0.00
+		defaultInternetEgress = 0.01
+	)
+
+	log.Infof("NetworkPricing: retrieving custom pricing data")
+	cpricing, err := do.GetConfig()
+	if err != nil || isDefaultNetworkPricing(cpricing) {
+		log.Warnf("NetworkPricing: failed to load custom pricing data: %v", err)
+		log.Infof("NetworkPricing: using fallback network prices: zone=%.4f, region=%.4f, internet=%.4f",
+			defaultZoneEgress, defaultRegionEgress, defaultInternetEgress)
+		return &models.Network{
+			ZoneNetworkEgressCost:     defaultZoneEgress,
+			RegionNetworkEgressCost:   defaultRegionEgress,
+			InternetNetworkEgressCost: defaultInternetEgress,
+		}, nil
+	}
+
+	znec := parseWithDefault(cpricing.ZoneNetworkEgress, defaultZoneEgress, "ZoneNetworkEgress")
+	rnec := parseWithDefault(cpricing.RegionNetworkEgress, defaultRegionEgress, "RegionNetworkEgress")
+	inec := parseWithDefault(cpricing.InternetNetworkEgress, defaultInternetEgress, "InternetNetworkEgress")
+
+	log.Infof("NetworkPricing: using parsed values: zone=%.4f/GiB, region=%.4f/GiB, internet=%.4f/GiB", znec, rnec, inec)
+
+	return &models.Network{
+		ZoneNetworkEgressCost:     znec,
+		RegionNetworkEgressCost:   rnec,
+		InternetNetworkEgressCost: inec,
+	}, nil
+}
+
+func parseWithDefault(val string, fallback float64, label string) float64 {
+	if val == "" {
+		log.Warnf("NetworkPricing: missing value for %s, using fallback %.4f", label, fallback)
+		return fallback
+	}
+	parsed, err := strconv.ParseFloat(val, 64)
+	if err != nil {
+		log.Warnf("NetworkPricing: failed to parse %s='%s', using fallback %.4f", label, val, fallback)
+		return fallback
+	}
+	return parsed
+}
+
+func isDefaultNetworkPricing(cp *models.CustomPricing) bool {
+	return cp != nil &&
+		cp.ZoneNetworkEgress == "0.01" &&
+		cp.RegionNetworkEgress == "0.01" &&
+		cp.InternetNetworkEgress == "0.12"
+}
+
+func (do *DOKS) AllNodePricing() (interface{}, error) {
+	_, _ = do.fetchPricingData()
+	return do.Cache, nil
+}
+
+func (do *DOKS) AllPVPricing() (map[models.PVKey]*models.PV, error) {
+	_, err := do.fetchPricingData()
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch pricing data: %w", err)
+	}
+
+	products, ok := do.Products["K8S_VOLUME"]
+	if !ok || len(products) == 0 {
+		return nil, fmt.Errorf("no PV products found")
+	}
+
+	// Only one PV product
+	product := products[0]
+	key := &doksPVKey{
+		id:           product.SKU,
+		storageClass: "do-block-storage",
+	}
+
+	pv, err := do.PVPricing(key)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get PV pricing: %w", err)
+	}
+
+	return map[models.PVKey]*models.PV{
+		key: pv,
+	}, nil
+}
+
+func (do *DOKS) GetPVKey(pv *clustercache.PersistentVolume, parameters map[string]string, defaultRegion string) models.PVKey {
+	var storageClass string
+	if pv.Spec.StorageClassName != "" {
+		storageClass = pv.Spec.StorageClassName
+	}
+
+	var volumeHandle string
+	if pv.Spec.CSI != nil {
+		volumeHandle = pv.Spec.CSI.VolumeHandle
+	}
+
+	sizeBytes := pv.Spec.Capacity.Storage().Value()
+
+	// Region is in node affinity
+	region := defaultRegion
+	if pv.Spec.NodeAffinity != nil && pv.Spec.NodeAffinity.Required != nil {
+		for _, term := range pv.Spec.NodeAffinity.Required.NodeSelectorTerms {
+			for _, expr := range term.MatchExpressions {
+				if expr.Key == "region" && len(expr.Values) > 0 {
+					region = expr.Values[0]
+					break
+				}
+			}
+		}
+	}
+
+	return &doksPVKey{
+		id:           pv.Name,
+		storageClass: storageClass,
+		sizeBytes:    sizeBytes,
+		ProviderID:   volumeHandle,
+		region:       region,
+	}
+}
+
+func (do *DOKS) ClusterInfo() (map[string]string, error) {
+	return map[string]string{"provider": "digitalocean", "platform": "doks"}, nil
+}
+
+func (do *DOKS) GetAddresses() ([]byte, error) {
+	return nil, nil
+}
+
+func (do *DOKS) GetDisks() ([]byte, error) {
+	return nil, nil
+}
+
+func (do *DOKS) GetOrphanedResources() ([]models.OrphanedResource, error) {
+	return nil, nil
+}
+
+func (do *DOKS) GpuPricing(input map[string]string) (string, error) {
+	return "", nil
+}
+
+func (do *DOKS) DownloadPricingData() error {
+	_, err := do.fetchPricingData()
+	return err
+}
+
+func (do *DOKS) UpdateConfig(r io.Reader, updateType string) (*models.CustomPricing, error) {
+	return nil, nil
+}
+
+func (do *DOKS) UpdateConfigFromConfigMap(map[string]string) (*models.CustomPricing, error) {
+	return nil, nil
+}
+
+func (do *DOKS) GetConfig() (*models.CustomPricing, error) {
+	if do.Config == nil {
+		log.Errorf("DOKS: ProviderConfig is nil")
+		return nil, fmt.Errorf("provider config not available")
+	}
+
+	customPricing, err := do.Config.GetCustomPricingData()
+	if err != nil {
+		log.Errorf("DOKS: failed to get custom pricing data: %v", err)
+		return nil, err
+	}
+	return customPricing, nil
+}
+
+func (do *DOKS) GetManagementPlatform() (string, error) {
+	return "DOKS", nil
+}
+
+func (do *DOKS) ApplyReservedInstancePricing(map[string]*models.Node) {}
+
+func (do *DOKS) ServiceAccountStatus() *models.ServiceAccountStatus {
+	return &models.ServiceAccountStatus{}
+}
+
+func (do *DOKS) PricingSourceStatus() map[string]*models.PricingSource {
+	return map[string]*models.PricingSource{}
+}
+
+func (do *DOKS) ClusterManagementPricing() (string, float64, error) {
+	return "", 0, nil
+}
+
+func (do *DOKS) CombinedDiscountForNode(string, bool, float64, float64) float64 {
+	return 0
+}
+
+func (do *DOKS) Regions() []string {
+	return []string{"nyc1", "sfo3", "ams3"}
+}
+
+func (do *DOKS) PricingSourceSummary() interface{} {
+	return nil
+}
+
+func (do *DOKS) GetClusterManagementPricing() float64 {
+	return do.ClusterManagementCost
+}
+
+func (do *DOKS) CustomPricingEnabled() bool {
+	return false
+}

+ 568 - 0
pkg/cloud/digitalocean/provider_test.go

@@ -0,0 +1,568 @@
+package digitalocean
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"testing"
+
+	"github.com/opencost/opencost/pkg/cloud/models"
+)
+
+func newTestProviderWithFile(t *testing.T, filename string) (*DOKS, func() int) {
+	t.Helper()
+
+	data, err := os.ReadFile(filename)
+	if err != nil {
+		t.Fatalf("Failed to read file: %v", err)
+	}
+
+	var count int
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		count++
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write(data)
+	}))
+
+	t.Cleanup(server.Close)
+
+	provider := NewDOKSProvider(server.URL)
+	return provider, func() int { return count }
+}
+
+func newTestProviderWith404(t *testing.T) *DOKS {
+	t.Helper()
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+	}))
+
+	t.Cleanup(server.Close)
+
+	provider := NewDOKSProvider(server.URL)
+	return provider
+}
+
+func TestNodePricing_APIMatches(t *testing.T) {
+	provider, callCount := newTestProviderWithFile(t, "testdata/do_pricing.json")
+
+	key := &doksKey{
+		Labels: map[string]string{
+			"node.kubernetes.io/instance-type": "s-1vcpu-2gb",
+			"kubernetes.io/arch":               "amd64",
+		},
+	}
+
+	node, meta, err := provider.NodePricing(key)
+	if err != nil {
+		t.Fatalf("expected no error, got: %v", err)
+	}
+
+	if node == nil {
+		t.Fatal("expected node pricing, got nil")
+	}
+
+	assertEqual := func(name, got, want string) {
+		if got != want {
+			t.Errorf("%s: got %s, want %s", name, got, want)
+		}
+	}
+
+	assertEqual("Cost", node.Cost, "0.01199")
+	assertEqual("VCPUCost", node.VCPUCost, "0.00400") // 1/3
+	assertEqual("RAMCost", node.RAMCost, "0.00799")   // 2/3
+	assertEqual("VCPU", node.VCPU, "1")
+	assertEqual("RAM", node.RAM, "2GiB")
+	assertEqual("ArchType", node.ArchType, "amd64")
+	assertEqual("PricingType", string(node.PricingType), string(models.DefaultPrices))
+
+	if meta.Source != "digitalocean" {
+		t.Errorf("expected metadata source to be digitalocean, got: %s", meta.Source)
+	}
+
+	if c := callCount(); c != 1 {
+		t.Errorf("expected 1 API call, got %d", c)
+	}
+}
+
+func TestNodePricing_Fallback(t *testing.T) {
+	provider, callCount := newTestProviderWithFile(t, "testdata/do_pricing.json")
+
+	key := &doksKey{
+		Labels: map[string]string{
+			"node.kubernetes.io/instance-type": "s-2vcpu-4gb",
+			"kubernetes.io/arch":               "amd64",
+		},
+	}
+
+	node, meta, err := provider.NodePricing(key)
+	if err != nil {
+		t.Fatalf("expected no error, got: %v", err)
+	}
+
+	if node == nil {
+		t.Fatal("expected node pricing, got nil")
+	}
+
+	assertEqual := func(name, got, want string) {
+		if got != want {
+			t.Errorf("%s: got %s, want %s", name, got, want)
+		}
+	}
+
+	assertEqual("Cost", node.Cost, "0.03571")
+	assertEqual("VCPUCost", node.VCPUCost, "0.00595")
+	assertEqual("RAMCost", node.RAMCost, "0.00595")
+	assertEqual("VCPU", node.VCPU, "2")
+	assertEqual("RAM", node.RAM, "4GiB")
+	assertEqual("ArchType", node.ArchType, "amd64")
+	assertEqual("PricingType", string(node.PricingType), string(models.DefaultPrices))
+
+	if meta.Source != "static-fallback" {
+		t.Errorf("expected metadata source to be static-fallback, got: %s", meta.Source)
+	}
+
+	if c := callCount(); c != 1 {
+		t.Errorf("expected 1 API call, got %d", c)
+	}
+}
+
+func TestNodePricing_Estimation_C8Intel(t *testing.T) {
+	provider := newTestProviderWith404(t)
+
+	key := &doksKey{
+		Labels: map[string]string{
+			"node.kubernetes.io/instance-type": "c-8-intel",
+			"kubernetes.io/arch":               "amd64",
+		},
+	}
+
+	node, meta, err := provider.NodePricing(key)
+	if err != nil {
+		t.Fatalf("expected no error, got: %v", err)
+	}
+
+	expectedCost := "0.32440"
+	expectedVCPUCost := "0.01352"
+	expectedRAMCost := "0.01352"
+
+	if node.Cost != expectedCost {
+		t.Errorf("Cost: got %s, want %s", node.Cost, expectedCost)
+	}
+	if node.VCPUCost != expectedVCPUCost {
+		t.Errorf("VCPUCost: got %s, want %s", node.VCPUCost, expectedVCPUCost)
+	}
+	if node.RAMCost != expectedRAMCost {
+		t.Errorf("RAMCost: got %s, want %s", node.RAMCost, expectedRAMCost)
+	}
+	if node.VCPU != "8" {
+		t.Errorf("VCPU: got %s, want 8", node.VCPU)
+	}
+	if node.RAM != "16GiB" {
+		t.Errorf("RAM: got %s, want 16GiB", node.RAM)
+	}
+	if meta.Source != "static-fallback" {
+		t.Errorf("expected metadata source to be static-fallback, got: %s", meta.Source)
+	}
+}
+
+func TestNodePricing_EstimationFromSlug(t *testing.T) {
+	tests := []struct {
+		name            string
+		slug            string
+		expectedVCPU    string
+		expectedRAM     string
+		expectedCost    string
+		expectedCPU     string
+		expectedRAMCost string
+	}{
+		{
+			name:            "s-4vcpu-8gb",
+			slug:            "s-4vcpu-8gb",
+			expectedVCPU:    "4",
+			expectedRAM:     "8GiB",
+			expectedCost:    "0.07143",
+			expectedCPU:     "0.00595",
+			expectedRAMCost: "0.00595",
+		},
+		{
+			name:            "m-8vcpu-64gb",
+			slug:            "m-8vcpu-64gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.50000",
+			expectedCPU:     "0.00694",
+			expectedRAMCost: "0.00694",
+		},
+		{
+			name:            "g-4vcpu-16gb-intel",
+			slug:            "g-4vcpu-16gb-intel",
+			expectedVCPU:    "4",
+			expectedRAM:     "16GiB",
+			expectedCost:    "0.22470",
+			expectedCPU:     "0.01124",
+			expectedRAMCost: "0.01124",
+		},
+	}
+
+	provider := newTestProviderWith404(t) // Force fallback/estimate
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			key := &doksKey{
+				Labels: map[string]string{
+					"node.kubernetes.io/instance-type": tc.slug,
+					"kubernetes.io/arch":               "amd64",
+				},
+			}
+
+			node, meta, err := provider.NodePricing(key)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			if node == nil {
+				t.Fatal("expected node to be non-nil")
+			}
+
+			assertEqual := func(field, got, want string) {
+				if got != want {
+					t.Errorf("%s: got %s, want %s", field, got, want)
+				}
+			}
+
+			assertEqual("Cost", node.Cost, tc.expectedCost)
+			assertEqual("VCPUCost", node.VCPUCost, tc.expectedCPU)
+			assertEqual("RAMCost", node.RAMCost, tc.expectedRAMCost)
+			assertEqual("VCPU", node.VCPU, tc.expectedVCPU)
+			assertEqual("RAM", node.RAM, tc.expectedRAM)
+			assertEqual("ArchType", node.ArchType, "amd64")
+
+			if meta.Source != "static-fallback" {
+				t.Errorf("expected metadata source to be 'static-fallback', got: %s", meta.Source)
+			}
+		})
+	}
+}
+
+func TestNodePricing_Estimation_BaseSlugs(t *testing.T) {
+	tests := []struct {
+		name            string
+		slug            string
+		expectedVCPU    string
+		expectedRAM     string
+		expectedCost    string
+		expectedCPU     string
+		expectedRAMCost string
+	}{
+		{
+			name:            "c-8-intel",
+			slug:            "c-8-intel",
+			expectedVCPU:    "8",
+			expectedRAM:     "16GiB",
+			expectedCost:    "0.32440",
+			expectedCPU:     "0.01352",
+			expectedRAMCost: "0.01352",
+		},
+		{
+			name:            "s-2vcpu-4gb",
+			slug:            "s-2vcpu-4gb",
+			expectedVCPU:    "2",
+			expectedRAM:     "4GiB",
+			expectedCost:    "0.03571",
+			expectedCPU:     "0.00595",
+			expectedRAMCost: "0.00595",
+		},
+		{
+			name:            "m-4vcpu-32gb",
+			slug:            "m-4vcpu-32gb",
+			expectedVCPU:    "4",
+			expectedRAM:     "32GiB",
+			expectedCost:    "0.25000",
+			expectedCPU:     "0.00694",
+			expectedRAMCost: "0.00694",
+		},
+		{
+			name:            "g-16vcpu-64gb-intel",
+			slug:            "g-16vcpu-64gb-intel",
+			expectedVCPU:    "16",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.89880",
+			expectedCPU:     "0.01124",
+			expectedRAMCost: "0.01124",
+		},
+	}
+
+	provider := newTestProviderWith404(t) // ensures fallback path is tested
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			key := &doksKey{
+				Labels: map[string]string{
+					"node.kubernetes.io/instance-type": tc.slug,
+					"kubernetes.io/arch":               "amd64",
+				},
+			}
+
+			node, meta, err := provider.NodePricing(key)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			if node == nil {
+				t.Fatal("expected node to be non-nil")
+			}
+
+			assertEqual := func(field, got, want string) {
+				if got != want {
+					t.Errorf("%s: got %s, want %s", field, got, want)
+				}
+			}
+
+			assertEqual("Cost", node.Cost, tc.expectedCost)
+			assertEqual("VCPUCost", node.VCPUCost, tc.expectedCPU)
+			assertEqual("RAMCost", node.RAMCost, tc.expectedRAMCost)
+			assertEqual("VCPU", node.VCPU, tc.expectedVCPU)
+			assertEqual("RAM", node.RAM, tc.expectedRAM)
+			assertEqual("ArchType", node.ArchType, "amd64")
+
+			if meta.Source != "static-fallback" {
+				t.Errorf("expected metadata source to be 'static-fallback', got: %s", meta.Source)
+			}
+		})
+	}
+}
+
+func TestNodePricing_Estimation_FamilySeeds(t *testing.T) {
+	tests := []struct {
+		name            string
+		slug            string
+		expectedVCPU    string
+		expectedRAM     string
+		expectedCost    string
+		expectedCPU     string
+		expectedRAMCost string
+	}{
+		{
+			name:            "c-16",
+			slug:            "c-16",
+			expectedVCPU:    "16",
+			expectedRAM:     "32GiB",
+			expectedCost:    "0.50000",
+			expectedCPU:     "0.01042",
+			expectedRAMCost: "0.01042",
+		},
+		{
+			name:            "c-16-intel",
+			slug:            "c-16-intel",
+			expectedVCPU:    "16",
+			expectedRAM:     "32GiB",
+			expectedCost:    "0.64880",
+			expectedCPU:     "0.01352",
+			expectedRAMCost: "0.01352",
+		},
+
+		{
+			name:            "c2-8vcpu-16gb",
+			slug:            "c2-8vcpu-16gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "16GiB",
+			expectedCost:    "0.27976",
+			expectedCPU:     "0.01166",
+			expectedRAMCost: "0.01166",
+		},
+		{
+			name:            "c2-8vcpu-16gb-intel",
+			slug:            "c2-8vcpu-16gb-intel",
+			expectedVCPU:    "8",
+			expectedRAM:     "16GiB",
+			expectedCost:    "0.36310",
+			expectedCPU:     "0.01513",
+			expectedRAMCost: "0.01513",
+		},
+		{
+			name:            "g-8vcpu-32gb",
+			slug:            "g-8vcpu-32gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "32GiB",
+			expectedCost:    "0.37500",
+			expectedCPU:     "0.00937",
+			expectedRAMCost: "0.00937",
+		},
+		{
+			name:            "g-8vcpu-32gb-intel",
+			slug:            "g-8vcpu-32gb-intel",
+			expectedVCPU:    "8",
+			expectedRAM:     "32GiB",
+			expectedCost:    "0.44940",
+			expectedCPU:     "0.01124",
+			expectedRAMCost: "0.01124",
+		},
+		{
+			name:            "gd-40vcpu-160gb",
+			slug:            "gd-40vcpu-160gb",
+			expectedVCPU:    "40",
+			expectedRAM:     "160GiB",
+			expectedCost:    "2.02380",
+			expectedCPU:     "0.01012",
+			expectedRAMCost: "0.01012",
+		},
+		{
+			name:            "gd-16vcpu-64gb-intel",
+			slug:            "gd-16vcpu-64gb-intel",
+			expectedVCPU:    "16",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.94048",
+			expectedCPU:     "0.01176",
+			expectedRAMCost: "0.01176",
+		},
+		{
+			name:            "m-16vcpu-128gb",
+			slug:            "m-16vcpu-128gb",
+			expectedVCPU:    "16",
+			expectedRAM:     "128GiB",
+			expectedCost:    "1.00000",
+			expectedCPU:     "0.00694",
+			expectedRAMCost: "0.00694",
+		},
+		{
+			name:            "m-16vcpu-128gb-intel",
+			slug:            "m-16vcpu-128gb-intel",
+			expectedVCPU:    "16",
+			expectedRAM:     "128GiB",
+			expectedCost:    "1.17858",
+			expectedCPU:     "0.00818",
+			expectedRAMCost: "0.00818",
+		},
+
+		// m3
+		{
+			name:            "m3-8vcpu-64gb",
+			slug:            "m3-8vcpu-64gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.61905",
+			expectedCPU:     "0.00860",
+			expectedRAMCost: "0.00860",
+		},
+		{
+			name:            "m3-32vcpu-256gb-intel",
+			slug:            "m3-32vcpu-256gb-intel",
+			expectedVCPU:    "32",
+			expectedRAM:     "256GiB",
+			expectedCost:    "2.61904",
+			expectedCPU:     "0.00909",
+			expectedRAMCost: "0.00909",
+		},
+		{
+			name:            "m6-8vcpu-64gb",
+			slug:            "m6-8vcpu-64gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.77976",
+			expectedCPU:     "0.01083",
+			expectedRAMCost: "0.01083",
+		},
+		{
+			name:            "m6-24vcpu-192gb",
+			slug:            "m6-24vcpu-192gb",
+			expectedVCPU:    "24",
+			expectedRAM:     "192GiB",
+			expectedCost:    "2.33928",
+			expectedCPU:     "0.01083",
+			expectedRAMCost: "0.01083",
+		},
+		{
+			name:            "s-1vcpu-2gb",
+			slug:            "s-1vcpu-2gb",
+			expectedVCPU:    "1",
+			expectedRAM:     "2GiB",
+			expectedCost:    "0.01786",
+			expectedCPU:     "0.00595",
+			expectedRAMCost: "0.00595",
+		},
+		{
+			name:            "s-8vcpu-16gb-intel",
+			slug:            "s-8vcpu-16gb-intel",
+			expectedVCPU:    "8",
+			expectedRAM:     "16GiB",
+			expectedCost:    "0.16666",
+			expectedCPU:     "0.00694",
+			expectedRAMCost: "0.00694",
+		},
+		{
+			name:            "so-8vcpu-64gb",
+			slug:            "so-8vcpu-64gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.77976",
+			expectedCPU:     "0.01083",
+			expectedRAMCost: "0.01083",
+		},
+		{
+			name:            "so-8vcpu-64gb-intel",
+			slug:            "so-8vcpu-64gb-intel",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.77976",
+			expectedCPU:     "0.01083",
+			expectedRAMCost: "0.01083",
+		},
+		{
+			name:            "so1_5-8vcpu-64gb",
+			slug:            "so1_5-8vcpu-64gb",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.97024",
+			expectedCPU:     "0.01348",
+			expectedRAMCost: "0.01348",
+		},
+		{
+			name:            "so1_5-8vcpu-64gb-intel",
+			slug:            "so1_5-8vcpu-64gb-intel",
+			expectedVCPU:    "8",
+			expectedRAM:     "64GiB",
+			expectedCost:    "0.82738",
+			expectedCPU:     "0.01149",
+			expectedRAMCost: "0.01149",
+		},
+	}
+
+	provider := newTestProviderWith404(t)
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			key := &doksKey{
+				Labels: map[string]string{
+					"node.kubernetes.io/instance-type": tc.slug,
+					"kubernetes.io/arch":               "amd64",
+				},
+			}
+
+			node, meta, err := provider.NodePricing(key)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if node == nil {
+				t.Fatal("expected node to be non-nil")
+			}
+
+			assertEqual := func(field, got, want string) {
+				if got != want {
+					t.Errorf("%s: got %s, want %s", field, got, want)
+				}
+			}
+
+			assertEqual("Cost", node.Cost, tc.expectedCost)
+			assertEqual("VCPUCost", node.VCPUCost, tc.expectedCPU)
+			assertEqual("RAMCost", node.RAMCost, tc.expectedRAMCost)
+			assertEqual("VCPU", node.VCPU, tc.expectedVCPU)
+			assertEqual("RAM", node.RAM, tc.expectedRAM)
+			assertEqual("ArchType", node.ArchType, "amd64")
+
+			if meta.Source != "static-fallback" {
+				t.Errorf("expected metadata source to be 'static-fallback', got: %s", meta.Source)
+			}
+		})
+	}
+}

+ 30 - 0
pkg/cloud/digitalocean/testdata/do_pricing.json

@@ -0,0 +1,30 @@
+{
+  "products": [
+    {
+      "sku": "1-KS-K8SWN-00123",
+      "itemType": "K8S_WORKER_NODE",
+      "displayName": "Kubernetes Worker Node, General Purpose Droplets - 1 VCPU 2GB RAM",
+      "category": "IAAS",
+      "prices": [
+        {
+          "unit": "ITEM_PER_SECOND",
+          "rate": "0.00000333",
+          "currency": "USD",
+          "region": "global",
+          "minAmount": "0.01",
+          "maxAmount": "10.00",
+          "minUsage": "60",
+          "maxUsage": "2419200"
+        }
+      ],
+      "attributes": [
+        {
+          "name": "size_id",
+          "value": "184",
+          "unit": "NO_UNIT"
+        }
+      ],
+      "effectiveAt": "2023-09-08T00:00:00Z"
+    }
+  ]
+}

+ 7 - 1
pkg/cloud/gcp/provider.go

@@ -735,6 +735,10 @@ func (gcp *GCP) parsePage(r io.Reader, inputKeys map[string]models.Key, pvKeys m
 					}
 				}
 
+				if (instanceType == "ram" || instanceType == "cpu") && strings.Contains(strings.ToUpper(product.Description), "N4 INSTANCE") {
+					instanceType = "n4standard"
+				}
+
 				if (instanceType == "ram" || instanceType == "cpu") && strings.Contains(strings.ToUpper(product.Description), "A2 INSTANCE") {
 					instanceType = "a2"
 				}
@@ -1501,6 +1505,8 @@ func parseGCPInstanceTypeLabel(it string) string {
 			instanceType = "n1standard" // These are priced the same. TODO: support n1ultrahighmem
 		} else if instanceType == "n2highmem" || instanceType == "n2highcpu" {
 			instanceType = "n2standard"
+		} else if instanceType == "n4highmem" || instanceType == "n4highcpu" {
+			instanceType = "n4standard" // N4 variants are priced the same per vCPU and RAM
 		} else if instanceType == "e2highmem" || instanceType == "e2highcpu" {
 			instanceType = "e2standard"
 		} else if instanceType == "n2dhighmem" || instanceType == "n2dhighcpu" {
@@ -1642,7 +1648,7 @@ func sustainedUseDiscount(class string, defaultDiscount float64, isPreemptible b
 	}
 	discount := defaultDiscount
 	switch class {
-	case "e2", "f1", "g1":
+	case "e2", "f1", "g1", "n4":
 		discount = 0.0
 	case "n2", "n2d":
 		discount = 0.2

+ 12 - 0
pkg/cloud/gcp/provider_test.go

@@ -36,6 +36,18 @@ func TestParseGCPInstanceTypeLabel(t *testing.T) {
 			input:    "n2d-highmem-8",
 			expected: "n2dstandard",
 		},
+		{
+			input:    "n4-standard-4",
+			expected: "n4standard",
+		},
+		{
+			input:    "n4-highcpu-8",
+			expected: "n4standard",
+		},
+		{
+			input:    "n4-highmem-16",
+			expected: "n4standard",
+		},
 	}
 
 	for _, test := range cases {

+ 1 - 0
pkg/cloud/models/models.go

@@ -163,6 +163,7 @@ type CustomPricing struct {
 	AthenaTable                  string `json:"athenaTable"`
 	AthenaWorkgroup              string `json:"athenaWorkgroup"`
 	MasterPayerARN               string `json:"masterPayerARN"`
+	AthenaCURVersion             string `json:"athenaCURVersion,omitempty"` // "1.0" or "2.0", defaults to "2.0"
 	BillingDataDataset           string `json:"billingDataDataset,omitempty"`
 	CustomPricesEnabled          string `json:"customPricesEnabled"`
 	DefaultIdle                  string `json:"defaultIdle"`

+ 32 - 0
pkg/cloud/models/models_test.go

@@ -116,3 +116,35 @@ func TestSetSetCustomPricingField(t *testing.T) {
 		})
 	}
 }
+
+func TestCustomPricing_AthenaCURVersion(t *testing.T) {
+	testCases := map[string]struct {
+		curVersion string
+		expected   string
+	}{
+		"CUR version 1.0": {
+			curVersion: "1.0",
+			expected:   "1.0",
+		},
+		"CUR version 2.0": {
+			curVersion: "2.0",
+			expected:   "2.0",
+		},
+		"empty CUR version": {
+			curVersion: "",
+			expected:   "",
+		},
+	}
+
+	for name, testCase := range testCases {
+		t.Run(name, func(t *testing.T) {
+			cp := &CustomPricing{
+				AthenaCURVersion: testCase.curVersion,
+			}
+
+			if cp.AthenaCURVersion != testCase.expected {
+				t.Errorf("expected AthenaCURVersion to be '%s', got '%s'", testCase.expected, cp.AthenaCURVersion)
+			}
+		})
+	}
+}

+ 14 - 0
pkg/cloud/provider/provider.go

@@ -15,6 +15,7 @@ import (
 	"github.com/opencost/opencost/pkg/cloud/alibaba"
 	"github.com/opencost/opencost/pkg/cloud/aws"
 	"github.com/opencost/opencost/pkg/cloud/azure"
+	"github.com/opencost/opencost/pkg/cloud/digitalocean"
 	"github.com/opencost/opencost/pkg/cloud/gcp"
 	"github.com/opencost/opencost/pkg/cloud/models"
 	"github.com/opencost/opencost/pkg/cloud/oracle"
@@ -215,6 +216,15 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string, config *config.
 			Config:        NewProviderConfig(config, cp.configFileName),
 			ClusterRegion: cp.region,
 		}, nil
+	case opencost.DigitalOceanProvider:
+		log.Info("Detected DigitalOcean, using DOKS")
+		return &digitalocean.DOKS{
+			Config:                NewProviderConfig(config, cp.configFileName),
+			Cache:                 digitalocean.NewPricingCache(),
+			Products:              make(map[string][]digitalocean.DOProduct),
+			Clientset:             cache,
+			ClusterManagementCost: 0.0,
+		}, nil
 	default:
 		log.Info("Unsupported provider, falling back to default")
 		return &CustomProvider{
@@ -290,6 +300,10 @@ func getClusterProperties(node *clustercache.Node) clusterProperties {
 		log.Debug("using OTC provider")
 		cp.provider = opencost.OTCProvider
 		cp.configFileName = "otc.json"
+	} else if strings.HasPrefix(providerID, "digitalocean") {
+		log.Debug("using DigitalOcean provider")
+		cp.provider = opencost.DigitalOceanProvider
+		cp.configFileName = "digitalocean.json"
 	}
 	// Override provider to CSV if CSVProvider is used and custom provider is not set
 	if env.IsUseCSVProvider() {

+ 1 - 1
pkg/cloudcost/ingestor.go

@@ -39,7 +39,7 @@ type IngestorConfig struct {
 func DefaultIngestorConfiguration() IngestorConfig {
 	return IngestorConfig{
 		Resolution:             timeutil.Day,
-		Duration:               timeutil.Day * time.Duration(env.GetDataRetentionDailyResolutionDays()),
+		Duration:               timeutil.Day * time.Duration(env.GetCloudCost1dRetention()),
 		MonthToDateRunInterval: env.GetCloudCostMonthToDateInterval(),
 		RefreshRate:            time.Hour * time.Duration(env.GetCloudCostRefreshRateHours()),
 		QueryWindow:            timeutil.Day * time.Duration(env.GetCloudCostQueryWindowDays()),

+ 64 - 0
pkg/currency/README.md

@@ -0,0 +1,64 @@
+# Currency Package
+
+Convert costs between currencies in OpenCost using live exchange rates. This package provides a reusable currency conversion utility for OpenCost components and plugins.
+
+## Quick Start
+
+```go
+import "github.com/opencost/opencost/pkg/currency"
+
+config := currency.Config{
+    APIKey:   "your-api-key",
+    CacheTTL: 24 * time.Hour,
+}
+
+converter, err := currency.NewConverter(config)
+if err != nil {
+    log.Fatal(err)
+}
+
+// Convert 100 USD to EUR
+amount, err := converter.Convert(100.0, "USD", "EUR")
+```
+
+## Setup
+
+Get a free API key from [exchangerate-api.com](https://www.exchangerate-api.com/) (1,500 requests/month).
+
+## How it Works
+
+The package fetches exchange rates and caches them for 24 hours. This keeps API usage low - most plugins use under 50 requests per month.
+
+Supports all ISO 4217 currencies (161 total). Thread-safe with automatic cache cleanup.
+
+## Example Usage in Plugins
+
+```go
+// Plugin config
+type PluginConfig struct {
+    TargetCurrency  string `json:"target_currency"`
+    ExchangeAPIKey  string `json:"exchange_api_key"`
+}
+
+// Initialize converter
+if config.ExchangeAPIKey != "" {
+    converter, _ := currency.NewConverter(currency.Config{
+        APIKey:   config.ExchangeAPIKey,
+        CacheTTL: 24 * time.Hour,
+    })
+}
+
+// Convert costs
+if converter != nil {
+    cost, _ = converter.Convert(cost, "USD", targetCurrency)
+}
+```
+
+## Testing
+
+```bash
+cd pkg/currency
+go test -v
+```
+
+Tests use mocks - no API calls needed.

+ 99 - 0
pkg/currency/cache.go

@@ -0,0 +1,99 @@
+package currency
+
+import (
+	"sync"
+	"time"
+)
+
+type memoryCache struct {
+	mu      sync.RWMutex
+	data    map[string]*cachedRates
+	ttl     time.Duration
+	janitor *time.Ticker
+}
+
+func newMemoryCache(ttl time.Duration) *memoryCache {
+	if ttl == 0 {
+		ttl = 24 * time.Hour
+	}
+
+	cache := &memoryCache{
+		data:    make(map[string]*cachedRates),
+		ttl:     ttl,
+		janitor: time.NewTicker(ttl / 2),
+	}
+
+	go cache.cleanup()
+
+	return cache
+}
+
+func (c *memoryCache) get(baseCurrency string) (*cachedRates, bool) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	rates, exists := c.data[baseCurrency]
+	if !exists {
+		return nil, false
+	}
+
+	if time.Now().After(rates.validUntil) {
+		return nil, false
+	}
+
+	return rates, true
+}
+
+func (c *memoryCache) set(baseCurrency string, rates *cachedRates) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	rates.validUntil = rates.fetchedAt.Add(c.ttl)
+	c.data[baseCurrency] = rates
+}
+
+func (c *memoryCache) clear() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.data = make(map[string]*cachedRates)
+}
+
+func (c *memoryCache) cleanup() {
+	for range c.janitor.C {
+		c.removeExpired()
+	}
+}
+
+func (c *memoryCache) removeExpired() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	now := time.Now()
+	for key, rates := range c.data {
+		if now.After(rates.validUntil) {
+			delete(c.data, key)
+		}
+	}
+}
+
+func (c *memoryCache) stop() {
+	if c.janitor != nil {
+		c.janitor.Stop()
+	}
+}
+
+func (c *memoryCache) stats() (entries int, oldestEntry time.Time) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	entries = len(c.data)
+
+	for _, rates := range c.data {
+		if oldestEntry.IsZero() || rates.fetchedAt.Before(oldestEntry) {
+			oldestEntry = rates.fetchedAt
+		}
+	}
+
+	return entries, oldestEntry
+}

+ 170 - 0
pkg/currency/cache_test.go

@@ -0,0 +1,170 @@
+package currency
+
+import (
+	"testing"
+	"time"
+)
+
+// TestMemoryCache_SetAndGet covers the basic store/retrieve round trip and a
+// miss on a base currency that was never stored.
+func TestMemoryCache_SetAndGet(t *testing.T) {
+	cache := newMemoryCache(1 * time.Hour)
+	defer cache.stop()
+
+	// Test setting and getting rates
+	rates := &cachedRates{
+		rates: map[string]float64{
+			"EUR": 0.85,
+			"GBP": 0.73,
+		},
+		baseCode:  "USD",
+		fetchedAt: time.Now(),
+	}
+
+	cache.set("USD", rates)
+
+	// Test successful get
+	retrieved, found := cache.get("USD")
+	if !found {
+		t.Error("expected to find cached rates")
+	}
+
+	if retrieved.baseCode != "USD" {
+		t.Errorf("expected base code USD, got %s", retrieved.baseCode)
+	}
+
+	if len(retrieved.rates) != 2 {
+		t.Errorf("expected 2 rates, got %d", len(retrieved.rates))
+	}
+
+	// Test non-existent key
+	_, found = cache.get("EUR")
+	if found {
+		t.Error("expected not to find rates for EUR")
+	}
+}
+
+// TestMemoryCache_Expiration checks that an entry stops being returned once
+// its TTL elapses.
+// NOTE(review): relies on wall-clock sleeps; may flake on heavily loaded CI.
+func TestMemoryCache_Expiration(t *testing.T) {
+	// Use short TTL for testing
+	cache := newMemoryCache(100 * time.Millisecond)
+	defer cache.stop()
+
+	rates := &cachedRates{
+		rates: map[string]float64{
+			"EUR": 0.85,
+		},
+		baseCode:  "USD",
+		fetchedAt: time.Now(),
+	}
+
+	cache.set("USD", rates)
+
+	// Should find it immediately
+	_, found := cache.get("USD")
+	if !found {
+		t.Error("expected to find cached rates immediately")
+	}
+
+	// Wait for expiration
+	time.Sleep(150 * time.Millisecond)
+
+	// Should not find it after expiration
+	_, found = cache.get("USD")
+	if found {
+		t.Error("expected rates to be expired")
+	}
+}
+
+// TestMemoryCache_Clear verifies clear() removes every stored base currency.
+func TestMemoryCache_Clear(t *testing.T) {
+	cache := newMemoryCache(1 * time.Hour)
+	defer cache.stop()
+
+	// Add multiple entries
+	for _, base := range []string{"USD", "EUR", "GBP"} {
+		rates := &cachedRates{
+			rates:     map[string]float64{"TEST": 1.0},
+			baseCode:  base,
+			fetchedAt: time.Now(),
+		}
+		cache.set(base, rates)
+	}
+
+	// Verify all entries exist
+	for _, base := range []string{"USD", "EUR", "GBP"} {
+		_, found := cache.get(base)
+		if !found {
+			t.Errorf("expected to find rates for %s", base)
+		}
+	}
+
+	// Clear cache
+	cache.clear()
+
+	// Verify all entries are gone
+	for _, base := range []string{"USD", "EUR", "GBP"} {
+		_, found := cache.get(base)
+		if found {
+			t.Errorf("expected not to find rates for %s after clear", base)
+		}
+	}
+}
+
+// TestMemoryCache_Stats checks the entry count and that the oldest fetchedAt
+// timestamp is reported correctly.
+func TestMemoryCache_Stats(t *testing.T) {
+	cache := newMemoryCache(1 * time.Hour)
+	defer cache.stop()
+
+	// Initially empty
+	entries, _ := cache.stats()
+	if entries != 0 {
+		t.Errorf("expected 0 entries, got %d", entries)
+	}
+
+	// Add entries
+	now := time.Now()
+	for i, base := range []string{"USD", "EUR", "GBP"} {
+		rates := &cachedRates{
+			rates:     map[string]float64{"TEST": 1.0},
+			baseCode:  base,
+			fetchedAt: now.Add(time.Duration(i) * time.Minute),
+		}
+		cache.set(base, rates)
+	}
+
+	entries, oldest := cache.stats()
+	if entries != 3 {
+		t.Errorf("expected 3 entries, got %d", entries)
+	}
+
+	// The oldest should be the first one we added (USD)
+	if !oldest.Equal(now) {
+		t.Errorf("expected oldest entry to be %v, got %v", now, oldest)
+	}
+}
+
+// TestMemoryCache_Cleanup verifies the background janitor evicts expired
+// entries without an explicit get().
+// NOTE(review): depends on janitor scheduling within a 100ms window — a
+// slow CI machine could make this flake; consider a longer margin.
+func TestMemoryCache_Cleanup(t *testing.T) {
+	// Use very short TTL for testing
+	cache := newMemoryCache(50 * time.Millisecond)
+	defer cache.stop()
+
+	// Add entry
+	rates := &cachedRates{
+		rates:     map[string]float64{"EUR": 0.85},
+		baseCode:  "USD",
+		fetchedAt: time.Now(),
+	}
+	cache.set("USD", rates)
+
+	// Verify it exists
+	entries, _ := cache.stats()
+	if entries != 1 {
+		t.Errorf("expected 1 entry, got %d", entries)
+	}
+
+	// Wait for cleanup cycle (janitor runs every TTL/2 = 25ms)
+	// Wait a bit longer to ensure cleanup has run
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify it's been cleaned up
+	entries, _ = cache.stats()
+	if entries != 0 {
+		t.Errorf("expected 0 entries after cleanup, got %d", entries)
+	}
+}

+ 89 - 0
pkg/currency/client.go

@@ -0,0 +1,89 @@
+package currency
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+)
+
+// Endpoint and identification defaults for the exchangerate-api.com v6 API.
+const (
+	apiBaseURL = "https://v6.exchangerate-api.com/v6"
+	userAgent  = "opencost-plugins/1.0"
+)
+
+// httpClient abstracts *http.Client so tests can inject a fake transport.
+type httpClient interface {
+	Do(req *http.Request) (*http.Response, error)
+}
+
+// exchangeRateClient fetches exchange rates from exchangerate-api.com.
+type exchangeRateClient struct {
+	apiKey     string
+	httpClient httpClient
+	timeout    time.Duration
+}
+
+func newExchangeRateClient(apiKey string, timeout time.Duration) *exchangeRateClient {
+	if timeout == 0 {
+		timeout = 10 * time.Second
+	}
+
+	return &exchangeRateClient{
+		apiKey: apiKey,
+		httpClient: &http.Client{
+			Timeout: timeout,
+		},
+		timeout: timeout,
+	}
+}
+
+// fetchRates retrieves the latest conversion rates for baseCurrency (default
+// "USD") from exchangerate-api.com and validates the response envelope.
+// The API key is embedded as a URL path segment, so errors are written to
+// avoid echoing the request URL.
+func (c *exchangeRateClient) fetchRates(baseCurrency string) (*exchangeRateResponse, error) {
+	if c.apiKey == "" {
+		return nil, fmt.Errorf("API key is required")
+	}
+
+	if baseCurrency == "" {
+		baseCurrency = "USD"
+	}
+
+	url := fmt.Sprintf("%s/%s/latest/%s", apiBaseURL, c.apiKey, baseCurrency)
+
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		// Do not wrap err: a url.Error would embed the full URL, leaking the
+		// API key into logs.
+		return nil, fmt.Errorf("failed to create request for base currency %s", baseCurrency)
+	}
+
+	req.Header.Set("User-Agent", userAgent)
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch exchange rates: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best-effort error detail; bound the read so a hostile or broken
+		// server cannot make us buffer an unbounded body.
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	var response exchangeRateResponse
+	if err := json.Unmarshal(body, &response); err != nil {
+		return nil, fmt.Errorf("failed to parse response: %w", err)
+	}
+
+	if response.Result != "success" {
+		return nil, fmt.Errorf("API returned error result: %s", response.Result)
+	}
+
+	if len(response.ConversionRates) == 0 {
+		return nil, fmt.Errorf("no conversion rates returned")
+	}
+
+	return &response, nil
+}

+ 105 - 0
pkg/currency/converter.go

@@ -0,0 +1,105 @@
+package currency
+
+import (
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+)
+
+// currencyConverter implements Converter on top of a rate-fetching client
+// and a cache of per-base-currency rate tables.
+type currencyConverter struct {
+	client client
+	cache  cache
+	config Config
+	mu     sync.RWMutex // serializes fetch-and-cache so concurrent misses fetch once
+}
+
+// NewConverter returns a Converter backed by exchangerate-api.com with an
+// in-memory rate cache. config.APIKey is mandatory; CacheTTL and APITimeout
+// default to 24 hours and 10 seconds respectively when left zero.
+func NewConverter(config Config) (Converter, error) {
+	if config.APIKey == "" {
+		return nil, fmt.Errorf("API key is required")
+	}
+
+	// Fill in defaults for any unset durations.
+	if config.CacheTTL == 0 {
+		config.CacheTTL = 24 * time.Hour
+	}
+	if config.APITimeout == 0 {
+		config.APITimeout = 10 * time.Second
+	}
+
+	converter := &currencyConverter{
+		client: newExchangeRateClient(config.APIKey, config.APITimeout),
+		cache:  newMemoryCache(config.CacheTTL),
+		config: config,
+	}
+	return converter, nil
+}
+
+// Convert translates amount between two ISO currency codes. Codes are
+// trimmed and upper-cased; converting a currency to itself is the identity.
+func (c *currencyConverter) Convert(amount float64, from, to string) (float64, error) {
+	from = strings.ToUpper(strings.TrimSpace(from))
+	to = strings.ToUpper(strings.TrimSpace(to))
+
+	// Identity conversion needs no rate lookup.
+	if from == to {
+		return amount, nil
+	}
+
+	rate, rateErr := c.GetRate(from, to)
+	if rateErr != nil {
+		return 0, fmt.Errorf("failed to get exchange rate from %s to %s: %w", from, to, rateErr)
+	}
+
+	return amount * rate, nil
+}
+
+// GetRate returns the multiplier converting one unit of from into to,
+// consulting the cache before falling back to the remote API.
+func (c *currencyConverter) GetRate(from, to string) (float64, error) {
+	from = strings.ToUpper(strings.TrimSpace(from))
+	to = strings.ToUpper(strings.TrimSpace(to))
+
+	if from == to {
+		return 1.0, nil
+	}
+
+	// Fast path: answer straight from the cache when possible.
+	if cached, ok := c.cache.get(from); ok && cached.rates != nil {
+		if rate, ok := cached.rates[to]; ok {
+			return rate, nil
+		}
+	}
+
+	rates, err := c.fetchAndCacheRates(from)
+	if err != nil {
+		return 0, err
+	}
+
+	if rate, ok := rates[to]; ok {
+		return rate, nil
+	}
+	return 0, fmt.Errorf("currency %s not supported or not found in exchange rates", to)
+}
+
+// fetchAndCacheRates fetches the rate table for baseCurrency from the API
+// and stores it. The mutex serializes fetches so simultaneous cache misses
+// result in a single API call; the cache is re-checked under the lock.
+func (c *currencyConverter) fetchAndCacheRates(baseCurrency string) (map[string]float64, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Another goroutine may have populated the cache while we waited.
+	if existing, found := c.cache.get(baseCurrency); found {
+		return existing.rates, nil
+	}
+
+	response, err := c.client.fetchRates(baseCurrency)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch rates from API: %w", err)
+	}
+
+	// Named "entry" (not "cachedRates") to avoid shadowing the type.
+	entry := &cachedRates{
+		rates:     response.ConversionRates,
+		baseCode:  response.BaseCode,
+		fetchedAt: time.Now(),
+	}
+	c.cache.set(baseCurrency, entry)
+
+	return response.ConversionRates, nil
+}

+ 228 - 0
pkg/currency/converter_test.go

@@ -0,0 +1,228 @@
+package currency
+
+import (
+	"fmt"
+	"testing"
+	"time"
+)
+
+// mockClient is a test double for the client interface; it serves canned
+// rate tables keyed by base currency, or a fixed error.
+type mockClient struct {
+	rates map[string]map[string]float64
+	err   error
+}
+
+// fetchRates returns the configured error, the canned rates for
+// baseCurrency, or an error when no table is configured for that base.
+func (m *mockClient) fetchRates(baseCurrency string) (*exchangeRateResponse, error) {
+	if m.err != nil {
+		return nil, m.err
+	}
+
+	rates, exists := m.rates[baseCurrency]
+	if !exists {
+		return nil, fmt.Errorf("no rates for base currency %s", baseCurrency)
+	}
+
+	return &exchangeRateResponse{
+		Result:          "success",
+		BaseCode:        baseCurrency,
+		ConversionRates: rates,
+	}, nil
+}
+
+// mockCache is a minimal, non-concurrent cache implementation for tests;
+// it honors validUntil like the real cache but has no janitor.
+type mockCache struct {
+	data map[string]*cachedRates
+}
+
+func newMockCache() *mockCache {
+	return &mockCache{
+		data: make(map[string]*cachedRates),
+	}
+}
+
+func (m *mockCache) get(baseCurrency string) (*cachedRates, bool) {
+	rates, exists := m.data[baseCurrency]
+	if !exists || time.Now().After(rates.validUntil) {
+		return nil, false
+	}
+	return rates, true
+}
+
+func (m *mockCache) set(baseCurrency string, rates *cachedRates) {
+	m.data[baseCurrency] = rates
+}
+
+func (m *mockCache) clear() {
+	m.data = make(map[string]*cachedRates)
+}
+
+// TestCurrencyConverter_Convert is a table-driven test of Convert covering
+// normal conversion, identity, case-insensitive codes, and an unsupported
+// target currency.
+func TestCurrencyConverter_Convert(t *testing.T) {
+	mockClient := &mockClient{
+		rates: map[string]map[string]float64{
+			"USD": {
+				"USD": 1.0,
+				"EUR": 0.85,
+				"GBP": 0.73,
+				"JPY": 110.0,
+			},
+			"EUR": {
+				"EUR": 1.0,
+				"USD": 1.18,
+				"GBP": 0.86,
+				"JPY": 129.53,
+			},
+		},
+	}
+
+	converter := &currencyConverter{
+		client: mockClient,
+		cache:  newMockCache(),
+		config: Config{APIKey: "test"},
+	}
+
+	tests := []struct {
+		name        string
+		amount      float64
+		from        string
+		to          string
+		expected    float64
+		expectError bool
+	}{
+		{
+			name:     "USD to EUR",
+			amount:   100,
+			from:     "USD",
+			to:       "EUR",
+			expected: 85,
+		},
+		{
+			name:     "USD to GBP",
+			amount:   100,
+			from:     "USD",
+			to:       "GBP",
+			expected: 73,
+		},
+		{
+			name:     "EUR to USD",
+			amount:   100,
+			from:     "EUR",
+			to:       "USD",
+			expected: 118,
+		},
+		{
+			name:     "Same currency",
+			amount:   100,
+			from:     "USD",
+			to:       "USD",
+			expected: 100,
+		},
+		{
+			name:     "Case insensitive",
+			amount:   100,
+			from:     "usd",
+			to:       "eur",
+			expected: 85,
+		},
+		{
+			name:        "Unsupported currency",
+			amount:      100,
+			from:        "USD",
+			to:          "XYZ",
+			expectError: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := converter.Convert(tt.amount, tt.from, tt.to)
+
+			if tt.expectError {
+				if err == nil {
+					t.Errorf("expected error but got none")
+				}
+				return
+			}
+
+			if err != nil {
+				t.Errorf("unexpected error: %v", err)
+				return
+			}
+
+			// Exact float comparison is safe: expected values are products of
+			// the exact mock rates, not accumulated arithmetic.
+			if result != tt.expected {
+				t.Errorf("expected %f, got %f", tt.expected, result)
+			}
+		})
+	}
+}
+
+// TestCurrencyConverter_GetRate checks rate lookup, the identity rate, and
+// that a repeated lookup is served (from cache) with the same value.
+func TestCurrencyConverter_GetRate(t *testing.T) {
+	mockClient := &mockClient{
+		rates: map[string]map[string]float64{
+			"USD": {
+				"USD": 1.0,
+				"EUR": 0.85,
+				"GBP": 0.73,
+			},
+		},
+	}
+
+	converter := &currencyConverter{
+		client: mockClient,
+		cache:  newMockCache(),
+		config: Config{APIKey: "test"},
+	}
+
+	// Test getting rate
+	rate, err := converter.GetRate("USD", "EUR")
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if rate != 0.85 {
+		t.Errorf("expected rate 0.85, got %f", rate)
+	}
+
+	// Test same currency
+	rate, err = converter.GetRate("USD", "USD")
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if rate != 1.0 {
+		t.Errorf("expected rate 1.0, got %f", rate)
+	}
+
+	// Test cache hit
+	rate, err = converter.GetRate("USD", "EUR")
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if rate != 0.85 {
+		t.Errorf("expected cached rate 0.85, got %f", rate)
+	}
+}
+
+// TestNewConverter validates constructor error handling for a missing API
+// key and that zero-valued durations receive their documented defaults.
+func TestNewConverter(t *testing.T) {
+	// Test with empty API key
+	_, err := NewConverter(Config{})
+	if err == nil {
+		t.Error("expected error for empty API key")
+	}
+
+	// Test with valid config
+	converter, err := NewConverter(Config{APIKey: "test-key"})
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+
+	// Convert to concrete type to access internal fields
+	cc, ok := converter.(*currencyConverter)
+	if !ok {
+		t.Error("expected converter to be of type *currencyConverter")
+		return
+	}
+
+	if cc.config.CacheTTL != 24*time.Hour {
+		t.Errorf("expected default cache TTL of 24h, got %v", cc.config.CacheTTL)
+	}
+
+	if cc.config.APITimeout != 10*time.Second {
+		t.Errorf("expected default API timeout of 10s, got %v", cc.config.APITimeout)
+	}
+}

+ 60 - 0
pkg/currency/types.go

@@ -0,0 +1,60 @@
+package currency
+
+import (
+	"time"
+)
+
+// Config holds configuration for the currency converter.
+type Config struct {
+	APIKey     string        // exchangerate-api.com API key (required)
+	CacheTTL   time.Duration // rate cache lifetime; zero means the 24h default
+	APITimeout time.Duration // HTTP timeout for rate fetches; zero means the 10s default
+}
+
+// Converter interface defines currency conversion operations
+type Converter interface {
+	// Convert converts an amount from one currency to another
+	Convert(amount float64, from, to string) (float64, error)
+
+	// GetRate returns the exchange rate between two currencies
+	GetRate(from, to string) (float64, error)
+}
+
+// exchangeRateResponse represents the API response from exchangerate-api.com
+type exchangeRateResponse struct {
+	Result             string             `json:"result"`
+	Documentation      string             `json:"documentation"`
+	TermsOfUse         string             `json:"terms_of_use"`
+	TimeLastUpdateUnix int64              `json:"time_last_update_unix"`
+	TimeLastUpdateUTC  string             `json:"time_last_update_utc"`
+	TimeNextUpdateUnix int64              `json:"time_next_update_unix"`
+	TimeNextUpdateUTC  string             `json:"time_next_update_utc"`
+	BaseCode           string             `json:"base_code"`
+	ConversionRates    map[string]float64 `json:"conversion_rates"`
+}
+
+// cachedRates stores exchange rates with metadata
+type cachedRates struct {
+	rates      map[string]float64 // target currency code -> multiplier
+	baseCode   string             // base currency these rates are relative to
+	fetchedAt  time.Time          // when the rates were retrieved
+	validUntil time.Time          // expiry deadline, stamped by the cache on set
+}
+
+// client interface for fetching exchange rates
+type client interface {
+	// fetchRates fetches current exchange rates for a base currency
+	fetchRates(baseCurrency string) (*exchangeRateResponse, error)
+}
+
+// cache interface for storing exchange rates
+type cache interface {
+	// get retrieves cached rates for a base currency
+	get(baseCurrency string) (*cachedRates, bool)
+
+	// set stores rates for a base currency with TTL
+	set(baseCurrency string, rates *cachedRates)
+
+	// clear removes all cached rates
+	clear()
+}

+ 2 - 2
pkg/customcost/ingestor.go

@@ -42,8 +42,8 @@ type CustomCostIngestorConfig struct {
 // DefaultIngestorConfiguration retrieves an CustomCostIngestorConfig from env variables
 func DefaultIngestorConfiguration() CustomCostIngestorConfig {
 	return CustomCostIngestorConfig{
-		DailyDuration:       timeutil.Day * time.Duration(env.GetDataRetentionDailyResolutionDays()),
-		HourlyDuration:      time.Hour * time.Duration(env.GetDataRetentionHourlyResolutionHours()),
+		DailyDuration:       timeutil.Day * time.Duration(env.GetCustomCost1dRetention()),
+		HourlyDuration:      time.Hour * time.Duration(env.GetCustomCost1hRetention()),
 		DailyQueryWindow:    timeutil.Day * time.Duration(env.GetCustomCostQueryWindowDays()),
 		HourlyQueryWindow:   time.Hour * time.Duration(env.GetCustomCostQueryWindowHours()),
 		PluginConfigDir:     env.GetPluginConfigDir(),

+ 2 - 2
pkg/customcost/querier.go

@@ -41,7 +41,7 @@ func getCustomCostAccumulateOption(window opencost.Window, from []opencost.Accum
 		from = allSteppedAccumulateOptions
 	}
 
-	hourlyStoreHours := env.GetDataRetentionHourlyResolutionHours()
+	hourlyStoreHours := env.GetCustomCost1hRetention()
 	hourlySteps := time.Duration(hourlyStoreHours) * time.Hour
 	oldestHourly := time.Now().Add(-1 * hourlySteps)
 
@@ -53,7 +53,7 @@ func getCustomCostAccumulateOption(window opencost.Window, from []opencost.Accum
 		return opencost.AccumulateOptionHour, nil
 	}
 
-	dailyStoreDays := env.GetDataRetentionDailyResolutionDays()
+	dailyStoreDays := env.GetCustomCost1dRetention()
 	dailySteps := time.Duration(dailyStoreDays) * timeutil.Day
 	oldestDaily := time.Now().Add(-1 * dailySteps)
 	// Use daily if...

+ 16 - 2
pkg/env/cloudcost.go

@@ -11,12 +11,14 @@ const (
 )
 
 const (
+	CloudCostEnvVarPrefix           = "CLOUD_COST_"
 	CloudCostEnabledEnvVar          = "CLOUD_COST_ENABLED"
 	CloudCostMonthToDateIntervalVar = "CLOUD_COST_MONTH_TO_DATE_INTERVAL"
 	CloudCostRefreshRateHoursEnvVar = "CLOUD_COST_REFRESH_RATE_HOURS"
 	CloudCostQueryWindowDaysEnvVar  = "CLOUD_COST_QUERY_WINDOW_DAYS"
 	CloudCostRunWindowDaysEnvVar    = "CLOUD_COST_RUN_WINDOW_DAYS"
 
+	CustomCostEnvVarPrefix          = "CUSTOM_COST_"
 	CustomCostEnabledEnvVar         = "CUSTOM_COST_ENABLED"
 	CustomCostQueryWindowDaysEnvVar = "CUSTOM_COST_QUERY_WINDOW_DAYS"
 
@@ -50,6 +52,14 @@ func GetCloudCostQueryWindowDays() int64 {
 	return env.GetInt64(CloudCostQueryWindowDaysEnvVar, 7)
 }
 
+func GetCloudCostRunWindowDays() int64 {
+	return env.GetInt64(CloudCostRunWindowDaysEnvVar, 3)
+}
+
+func GetCloudCost1dRetention() int {
+	return env.GetPrefixInt(CloudCostEnvVarPrefix, env.Resolution1dRetentionEnvVar, 30)
+}
+
 func GetCustomCostQueryWindowHours() int64 {
 	return env.GetInt64(CustomCostQueryWindowDaysEnvVar, 1)
 }
@@ -58,8 +68,12 @@ func GetCustomCostQueryWindowDays() int64 {
 	return env.GetInt64(CustomCostQueryWindowDaysEnvVar, 7)
 }
 
-func GetCloudCostRunWindowDays() int64 {
-	return env.GetInt64(CloudCostRunWindowDaysEnvVar, 3)
+func GetCustomCost1dRetention() int {
+	return env.GetPrefixInt(CustomCostEnvVarPrefix, env.Resolution1dRetentionEnvVar, 30)
+}
+
+func GetCustomCost1hRetention() int {
+	return env.GetPrefixInt(CustomCostEnvVarPrefix, env.Resolution1hRetentionEnvVar, 49)
 }
 
 func GetPluginConfigDir() string {

+ 7 - 13
pkg/env/costmodel.go

@@ -32,7 +32,8 @@ const (
 	AzureOfferIDEnvVar        = "AZURE_OFFER_ID"
 	AzureBillingAccountEnvVar = "AZURE_BILLING_ACCOUNT"
 
-	OCIPricingURL = "OCI_PRICING_URL"
+	// Currently being used for OCI and DigitalOcean
+	ProviderPricingURL = "PROVIDER_PRICING_URL"
 
 	ClusterProfileEnvVar    = "CLUSTER_PROFILE"
 	RemoteEnabledEnvVar     = "REMOTE_WRITE_ENABLED"
@@ -78,9 +79,6 @@ const (
 	ExportCSVLabelsAll  = "EXPORT_CSV_LABELS_ALL"
 	ExportCSVMaxDays    = "EXPORT_CSV_MAX_DAYS"
 
-	DataRetentionDailyResolutionDaysEnvVar   = "DATA_RETENTION_DAILY_RESOLUTION_DAYS"
-	DataRetentionHourlyResolutionHoursEnvVar = "DATA_RETENTION_HOURLY_RESOLUTION_HOURS"
-
 	CarbonEstimatesEnabledEnvVar = "CARBON_ESTIMATES_ENABLED"
 
 	KubernetesResourceAccessEnvVar = "KUBERNETES_RESOURCE_ACCESS"
@@ -331,20 +329,12 @@ func GetRegionOverrideList() []string {
 	return regionList
 }
 
-func GetDataRetentionDailyResolutionDays() int64 {
-	return env.GetInt64(DataRetentionDailyResolutionDaysEnvVar, 30)
-}
-
-func GetDataRetentionHourlyResolutionHours() int64 {
-	return env.GetInt64(DataRetentionHourlyResolutionHoursEnvVar, 49)
-}
-
 func IsKubernetesEnabled() bool {
 	return env.Get(KubernetesEnabledEnvVar, "") != ""
 }
 
 func GetOCIPricingURL() string {
-	return env.Get(OCIPricingURL, "https://apexapps.oracle.com/pls/apex/cetools/api/v1/products")
+	return env.Get(ProviderPricingURL, "https://apexapps.oracle.com/pls/apex/cetools/api/v1/products")
 }
 
 func IsCarbonEstimatesEnabled() bool {
@@ -373,3 +363,7 @@ func GetLocalCollectorDirectory() string {
 	dir := env.Get(LocalCollectorDirectoryEnvVar, DefaultLocalCollectorDir)
 	return env.GetPathFromConfig(dir)
 }
+
+func GetDOKSPricingURL() string {
+	return env.Get(ProviderPricingURL, "https://api.digitalocean.com/v2/billing/pricing")
+}

+ 30 - 6
pkg/metrics/deploymentmetrics.go

@@ -41,10 +41,11 @@ func (kdc KubecostDeploymentCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, deployment := range ds {
 		deploymentName := deployment.Name
 		deploymentNS := deployment.Namespace
+		deploymentUID := string(deployment.UID)
 
 		labels, values := promutil.KubeLabelsToLabels(promutil.SanitizeLabels(deployment.MatchLabels))
 		if len(labels) > 0 {
-			m := newDeploymentMatchLabelsMetric(deploymentName, deploymentNS, "deployment_match_labels", labels, values)
+			m := newDeploymentMatchLabelsMetric(deploymentName, deploymentNS, "deployment_match_labels", labels, values, deploymentUID)
 			ch <- m
 		}
 	}
@@ -63,10 +64,11 @@ type DeploymentMatchLabelsMetric struct {
 	labelValues    []string
 	deploymentName string
 	namespace      string
+	uid            string
 }
 
 // Creates a new DeploymentMatchLabelsMetric, implementation of prometheus.Metric
-func newDeploymentMatchLabelsMetric(name, namespace, fqname string, labelNames, labelvalues []string) DeploymentMatchLabelsMetric {
+func newDeploymentMatchLabelsMetric(name, namespace, fqname string, labelNames, labelvalues []string, uid string) DeploymentMatchLabelsMetric {
 	return DeploymentMatchLabelsMetric{
 		fqName:         fqname,
 		labelNames:     labelNames,
@@ -74,6 +76,7 @@ func newDeploymentMatchLabelsMetric(name, namespace, fqname string, labelNames,
 		help:           "deployment_match_labels Deployment Match Labels",
 		deploymentName: name,
 		namespace:      namespace,
+		uid:            uid,
 	}
 }
 
@@ -83,6 +86,7 @@ func (dmlm DeploymentMatchLabelsMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"deployment": dmlm.deploymentName,
 		"namespace":  dmlm.namespace,
+		"uid":        dmlm.uid,
 	}
 	return prometheus.NewDesc(dmlm.fqName, dmlm.help, dmlm.labelNames, l)
 }
@@ -109,6 +113,10 @@ func (dmlm DeploymentMatchLabelsMetric) Write(m *dto.Metric) error {
 		Name:  toStringPtr("deployment"),
 		Value: &dmlm.deploymentName,
 	})
+	labels = append(labels, &dto.LabelPair{
+		Name:  toStringPtr("uid"),
+		Value: &dmlm.uid,
+	})
 	m.Label = labels
 	return nil
 }
@@ -145,6 +153,7 @@ func (kdc KubeDeploymentCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, deployment := range deployments {
 		deploymentName := deployment.Name
 		deploymentNS := deployment.Namespace
+		deploymentUID := string(deployment.UID)
 
 		// Replicas Defined
 		var replicas int32
@@ -155,7 +164,7 @@ func (kdc KubeDeploymentCollector) Collect(ch chan<- prometheus.Metric) {
 		}
 
 		if _, disabled := disabledMetrics["kube_deployment_spec_replicas"]; !disabled {
-			ch <- newKubeDeploymentReplicasMetric("kube_deployment_spec_replicas", deploymentName, deploymentNS, replicas)
+			ch <- newKubeDeploymentReplicasMetric("kube_deployment_spec_replicas", deploymentName, deploymentNS, replicas, deploymentUID)
 		}
 		if _, disabled := disabledMetrics["kube_deployment_status_replicas_available"]; !disabled {
 			// Replicas Available
@@ -163,7 +172,8 @@ func (kdc KubeDeploymentCollector) Collect(ch chan<- prometheus.Metric) {
 				"kube_deployment_status_replicas_available",
 				deploymentName,
 				deploymentNS,
-				deployment.StatusAvailableReplicas)
+				deployment.StatusAvailableReplicas,
+				deploymentUID)
 		}
 	}
 }
@@ -179,16 +189,18 @@ type KubeDeploymentReplicasMetric struct {
 	deployment string
 	namespace  string
 	replicas   float64
+	uid        string
 }
 
 // Creates a new DeploymentMatchLabelsMetric, implementation of prometheus.Metric
-func newKubeDeploymentReplicasMetric(fqname, deployment, namespace string, replicas int32) KubeDeploymentReplicasMetric {
+func newKubeDeploymentReplicasMetric(fqname, deployment, namespace string, replicas int32, uid string) KubeDeploymentReplicasMetric {
 	return KubeDeploymentReplicasMetric{
 		fqName:     fqname,
 		help:       "kube_deployment_spec_replicas Number of desired pods for a deployment.",
 		deployment: deployment,
 		namespace:  namespace,
 		replicas:   float64(replicas),
+		uid:        uid,
 	}
 }
 
@@ -198,6 +210,7 @@ func (kdr KubeDeploymentReplicasMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"deployment": kdr.deployment,
 		"namespace":  kdr.namespace,
+		"uid":        kdr.uid,
 	}
 	return prometheus.NewDesc(kdr.fqName, kdr.help, []string{}, l)
 }
@@ -217,6 +230,10 @@ func (kdr KubeDeploymentReplicasMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("deployment"),
 			Value: &kdr.deployment,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kdr.uid,
+		},
 	}
 
 	return nil
@@ -233,16 +250,18 @@ type KubeDeploymentStatusAvailableReplicasMetric struct {
 	deployment        string
 	namespace         string
 	replicasAvailable float64
+	uid               string
 }
 
 // Creates a new DeploymentMatchLabelsMetric, implementation of prometheus.Metric
-func newKubeDeploymentStatusAvailableReplicasMetric(fqname, deployment, namespace string, replicasAvailable int32) KubeDeploymentStatusAvailableReplicasMetric {
+func newKubeDeploymentStatusAvailableReplicasMetric(fqname, deployment, namespace string, replicasAvailable int32, uid string) KubeDeploymentStatusAvailableReplicasMetric {
 	return KubeDeploymentStatusAvailableReplicasMetric{
 		fqName:            fqname,
 		help:              "kube_deployment_status_replicas_available The number of available replicas per deployment.",
 		deployment:        deployment,
 		namespace:         namespace,
 		replicasAvailable: float64(replicasAvailable),
+		uid:               uid,
 	}
 }
 
@@ -252,6 +271,7 @@ func (kdr KubeDeploymentStatusAvailableReplicasMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"deployment": kdr.deployment,
 		"namespace":  kdr.namespace,
+		"uid":        kdr.uid,
 	}
 	return prometheus.NewDesc(kdr.fqName, kdr.help, []string{}, l)
 }
@@ -271,6 +291,10 @@ func (kdr KubeDeploymentStatusAvailableReplicasMetric) Write(m *dto.Metric) erro
 			Name:  toStringPtr("deployment"),
 			Value: &kdr.deployment,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kdr.uid,
+		},
 	}
 
 	return nil

+ 541 - 0
pkg/metrics/deploymentmetrics_test.go

@@ -0,0 +1,541 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+// TestKubecostDeploymentCollector_Describe verifies that Describe emits a
+// descriptor only while deployment_match_labels is not in the disabled set.
+func TestKubecostDeploymentCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectMetric    bool
+	}{
+		{
+			name:            "deployment_match_labels enabled",
+			disabledMetrics: []string{},
+			expectMetric:    true,
+		},
+		{
+			name:            "deployment_match_labels disabled",
+			disabledMetrics: []string{"deployment_match_labels"},
+			expectMetric:    false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kdc := KubecostDeploymentCollector{
+				KubeClusterCache: NewFakeDeploymentCache([]*clustercache.Deployment{}),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan *prometheus.Desc, 10)
+			kdc.Describe(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if tt.expectMetric && count == 0 {
+				t.Error("Expected metric description but got none")
+			}
+			if !tt.expectMetric && count > 0 {
+				t.Error("Expected no metric description but got some")
+			}
+		})
+	}
+}
+
+// TestKubecostDeploymentCollector_Collect is a table-driven test confirming
+// one deployment_match_labels metric per deployment that has match labels,
+// and none when the metric is disabled or labels are empty.
+func TestKubecostDeploymentCollector_Collect(t *testing.T) {
+	tests := []struct {
+		name            string
+		deployments     []*clustercache.Deployment
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "single deployment with match labels",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:         types.UID("test-uid-1"),
+					Name:        "test-deployment",
+					Namespace:   "default",
+					MatchLabels: map[string]string{"app": "test", "version": "v1"},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1,
+		},
+		{
+			name: "deployment without match labels",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:         types.UID("test-uid-2"),
+					Name:        "empty-deployment",
+					Namespace:   "default",
+					MatchLabels: map[string]string{},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   0,
+		},
+		{
+			name: "multiple deployments with match labels",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:         types.UID("test-uid-3"),
+					Name:        "deployment1",
+					Namespace:   "ns1",
+					MatchLabels: map[string]string{"app": "app1"},
+				},
+				{
+					UID:         types.UID("test-uid-4"),
+					Name:        "deployment2",
+					Namespace:   "ns2",
+					MatchLabels: map[string]string{"component": "frontend", "tier": "web"},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   2,
+		},
+		{
+			name: "metric disabled",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:         types.UID("test-uid-5"),
+					Name:        "test-deployment",
+					Namespace:   "default",
+					MatchLabels: map[string]string{"app": "test"},
+				},
+			},
+			disabledMetrics: []string{"deployment_match_labels"},
+			expectedCount:   0,
+		},
+		{
+			name: "mixed deployments with and without labels",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:         types.UID("test-uid-6"),
+					Name:        "with-labels",
+					Namespace:   "default",
+					MatchLabels: map[string]string{"app": "test"},
+				},
+				{
+					UID:         types.UID("test-uid-7"),
+					Name:        "without-labels",
+					Namespace:   "default",
+					MatchLabels: map[string]string{},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kdc := KubecostDeploymentCollector{
+				KubeClusterCache: NewFakeDeploymentCache(tt.deployments),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan prometheus.Metric, 10)
+			kdc.Collect(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+// TestDeploymentMatchLabelsMetric exercises Desc and Write on the metric
+// directly, asserting the gauge value and the full label set including the
+// newly added uid label.
+func TestDeploymentMatchLabelsMetric(t *testing.T) {
+	labelNames := []string{"app", "version", "tier"}
+	labelValues := []string{"myapp", "v2.0", "backend"}
+	uid := "test-deployment-uid"
+
+	metric := newDeploymentMatchLabelsMetric("test-deployment", "production", "deployment_match_labels", labelNames, labelValues, uid)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"app":        "myapp",
+		"version":    "v2.0",
+		"tier":       "backend",
+		"deployment": "test-deployment",
+		"namespace":  "production",
+		"uid":        uid,
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+// TestKubeDeploymentCollector_Describe verifies the descriptor count tracks
+// which of the two replica metrics are disabled.
+func TestKubeDeploymentCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name:            "all metrics enabled",
+			disabledMetrics: []string{},
+			expectedCount:   2,
+		},
+		{
+			name:            "spec replicas disabled",
+			disabledMetrics: []string{"kube_deployment_spec_replicas"},
+			expectedCount:   1,
+		},
+		{
+			name:            "status replicas disabled",
+			disabledMetrics: []string{"kube_deployment_status_replicas_available"},
+			expectedCount:   1,
+		},
+		{
+			name:            "all metrics disabled",
+			disabledMetrics: []string{"kube_deployment_spec_replicas", "kube_deployment_status_replicas_available"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kdc := KubeDeploymentCollector{
+				KubeClusterCache: NewFakeDeploymentCache([]*clustercache.Deployment{}),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan *prometheus.Desc, 10)
+			kdc.Describe(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestKubeDeploymentCollector_Collect(t *testing.T) {
+	replicas3 := int32(3)
+	replicas0 := int32(0)
+
+	tests := []struct {
+		name            string
+		deployments     []*clustercache.Deployment
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "deployment with explicit replicas",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:                     types.UID("test-uid-1"),
+					Name:                    "test-deployment",
+					Namespace:               "default",
+					SpecReplicas:            &replicas3,
+					StatusAvailableReplicas: 2,
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   2, // spec replicas + status available replicas
+		},
+		{
+			name: "deployment with nil replicas defaults to 1",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:                     types.UID("test-uid-2"),
+					Name:                    "default-replicas",
+					Namespace:               "default",
+					SpecReplicas:            nil,
+					StatusAvailableReplicas: 1,
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   2,
+		},
+		{
+			name: "deployment with zero replicas",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:                     types.UID("test-uid-3"),
+					Name:                    "zero-replicas",
+					Namespace:               "default",
+					SpecReplicas:            &replicas0,
+					StatusAvailableReplicas: 0,
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   2,
+		},
+		{
+			name: "multiple deployments",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:                     types.UID("test-uid-4"),
+					Name:                    "deployment1",
+					Namespace:               "ns1",
+					SpecReplicas:            &replicas3,
+					StatusAvailableReplicas: 3,
+				},
+				{
+					UID:                     types.UID("test-uid-5"),
+					Name:                    "deployment2",
+					Namespace:               "ns2",
+					SpecReplicas:            nil,
+					StatusAvailableReplicas: 0,
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   4, // 2 metrics per deployment
+		},
+		{
+			name: "spec replicas disabled",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:                     types.UID("test-uid-6"),
+					Name:                    "test-deployment",
+					Namespace:               "default",
+					SpecReplicas:            &replicas3,
+					StatusAvailableReplicas: 2,
+				},
+			},
+			disabledMetrics: []string{"kube_deployment_spec_replicas"},
+			expectedCount:   1, // only status available replicas
+		},
+		{
+			name: "status replicas disabled",
+			deployments: []*clustercache.Deployment{
+				{
+					UID:                     types.UID("test-uid-7"),
+					Name:                    "test-deployment",
+					Namespace:               "default",
+					SpecReplicas:            &replicas3,
+					StatusAvailableReplicas: 2,
+				},
+			},
+			disabledMetrics: []string{"kube_deployment_status_replicas_available"},
+			expectedCount:   1, // only spec replicas
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kdc := KubeDeploymentCollector{
+				KubeClusterCache: NewFakeDeploymentCache(tt.deployments),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan prometheus.Metric, 10)
+			kdc.Collect(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestKubeDeploymentReplicasMetric(t *testing.T) {
+	metric := newKubeDeploymentReplicasMetric("kube_deployment_spec_replicas", "web-app", "production", 5, "deployment-uid")
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 5.0 {
+		t.Errorf("Expected gauge value 5.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"deployment": "web-app",
+		"namespace":  "production",
+		"uid":        "deployment-uid",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestKubeDeploymentStatusAvailableReplicasMetric(t *testing.T) {
+	metric := newKubeDeploymentStatusAvailableReplicasMetric("kube_deployment_status_replicas_available", "api-server", "backend", 3, "api-uid")
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 3.0 {
+		t.Errorf("Expected gauge value 3.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"deployment": "api-server",
+		"namespace":  "backend",
+		"uid":        "api-uid",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestKubeDeploymentCollector_DefaultReplicas(t *testing.T) {
+	// Test that nil replicas defaults to 1
+	deployment := &clustercache.Deployment{
+		UID:                     types.UID("test-uid"),
+		Name:                    "test-deployment",
+		Namespace:               "default",
+		SpecReplicas:            nil,
+		StatusAvailableReplicas: 0,
+	}
+
+	mc := MetricsConfig{
+		DisabledMetrics: []string{"kube_deployment_status_replicas_available"}, // Only test spec replicas
+	}
+	kdc := KubeDeploymentCollector{
+		KubeClusterCache: NewFakeDeploymentCache([]*clustercache.Deployment{deployment}),
+		metricsConfig:    mc,
+	}
+
+	ch := make(chan prometheus.Metric, 10)
+	kdc.Collect(ch)
+	close(ch)
+
+	for metric := range ch {
+		var dtoMetric dto.Metric
+		metric.Write(&dtoMetric)
+		if *dtoMetric.Gauge.Value != 1.0 {
+			t.Errorf("Expected default replicas value 1.0, got %f", *dtoMetric.Gauge.Value)
+		}
+	}
+}
+
+// FakeDeploymentCache implements ClusterCache interface for testing
+type FakeDeploymentCache struct {
+	clustercache.ClusterCache
+	deployments []*clustercache.Deployment
+}
+
+func (f FakeDeploymentCache) GetAllDeployments() []*clustercache.Deployment {
+	return f.deployments
+}
+
+func NewFakeDeploymentCache(deployments []*clustercache.Deployment) FakeDeploymentCache {
+	return FakeDeploymentCache{
+		deployments: deployments,
+	}
+}

+ 12 - 4
pkg/metrics/jobmetrics.go

@@ -43,9 +43,10 @@ func (kjc KubeJobCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, job := range jobs {
 		jobName := job.Name
 		jobNS := job.Namespace
+		jobUID := string(job.UID)
 
 		if job.Status.Failed == 0 {
-			ch <- newKubeJobStatusFailedMetric(jobName, jobNS, "kube_job_status_failed", "", 0)
+			ch <- newKubeJobStatusFailedMetric(jobName, jobNS, jobUID, "kube_job_status_failed", "", 0)
 		} else {
 			for _, condition := range job.Status.Conditions {
 				if condition.Type == batchv1.JobFailed {
@@ -53,12 +54,12 @@ func (kjc KubeJobCollector) Collect(ch chan<- prometheus.Metric) {
 					for _, reason := range jobFailureReasons {
 						reasonKnown = reasonKnown || failureReason(&condition, reason)
 
-						ch <- newKubeJobStatusFailedMetric(jobName, jobNS, "kube_job_status_failed", reason, boolFloat64(failureReason(&condition, reason)))
+						ch <- newKubeJobStatusFailedMetric(jobName, jobNS, jobUID, "kube_job_status_failed", reason, boolFloat64(failureReason(&condition, reason)))
 					}
 
 					// for unknown reasons
 					if !reasonKnown {
-						ch <- newKubeJobStatusFailedMetric(jobName, jobNS, "kube_job_status_failed", "", float64(job.Status.Failed))
+						ch <- newKubeJobStatusFailedMetric(jobName, jobNS, jobUID, "kube_job_status_failed", "", float64(job.Status.Failed))
 					}
 				}
 			}
@@ -77,17 +78,19 @@ type KubeJobStatusFailedMetric struct {
 	help      string
 	job       string
 	namespace string
+	uid       string
 	reason    string
 	value     float64
 }
 
 // Creates a new KubeJobStatusFailedMetric, implementation of prometheus.Metric
-func newKubeJobStatusFailedMetric(job, namespace, fqName, reason string, value float64) KubeJobStatusFailedMetric {
+func newKubeJobStatusFailedMetric(job, namespace, uid, fqName, reason string, value float64) KubeJobStatusFailedMetric {
 	return KubeJobStatusFailedMetric{
 		fqName:    fqName,
 		help:      "kube_job_status_failed Failed job",
 		job:       job,
 		namespace: namespace,
+		uid:       uid,
 		reason:    reason,
 		value:     value,
 	}
@@ -99,6 +102,7 @@ func (kjsf KubeJobStatusFailedMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"job_name":  kjsf.job,
 		"namespace": kjsf.namespace,
+		"uid":       kjsf.uid,
 		"reason":    kjsf.reason,
 	}
 	return prometheus.NewDesc(kjsf.fqName, kjsf.help, []string{}, l)
@@ -119,6 +123,10 @@ func (kjsf KubeJobStatusFailedMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("namespace"),
 			Value: &kjsf.namespace,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kjsf.uid,
+		},
 		{
 			Name:  toStringPtr("reason"),
 			Value: &kjsf.reason,

+ 91 - 0
pkg/metrics/jobmetrics_test.go

@@ -0,0 +1,91 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	batchv1 "k8s.io/api/batch/v1"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+type mockJobCache struct {
+	clustercache.ClusterCache
+	jobs []*clustercache.Job
+}
+
+func (m mockJobCache) GetAllJobs() []*clustercache.Job {
+	return m.jobs
+}
+
+func TestKubeJobCollector_Collect(t *testing.T) {
+	// Test with job that has no failures
+	cache := mockJobCache{
+		jobs: []*clustercache.Job{
+			{
+				Name:      "test-job",
+				Namespace: "default",
+				UID:       types.UID("test-job-uid"),
+				Status:    batchv1.JobStatus{Failed: 0},
+			},
+		},
+	}
+
+	collector := KubeJobCollector{
+		KubeClusterCache: cache,
+		metricsConfig:    MetricsConfig{},
+	}
+
+	ch := make(chan prometheus.Metric, 10)
+	go func() {
+		collector.Collect(ch)
+		close(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 1 {
+		t.Errorf("Expected 1 metric, got %d", count)
+	}
+}
+
+func TestKubeJobStatusFailedMetric_Write(t *testing.T) {
+	metric := newKubeJobStatusFailedMetric(
+		"test-job",
+		"default",
+		"test-job-uid",
+		"kube_job_status_failed",
+		"",
+		0.0,
+	)
+
+	pbMetric := &dto.Metric{}
+	err := metric.Write(pbMetric)
+	if err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	if pbMetric.Gauge == nil || *pbMetric.Gauge.Value != 0.0 {
+		t.Error("Expected gauge value 0.0")
+	}
+
+	if len(pbMetric.Label) != 4 { // job_name + namespace + uid + reason
+		t.Errorf("Expected 4 labels, got %d", len(pbMetric.Label))
+	}
+
+	// Verify UID label is present
+	foundUID := false
+	for _, label := range pbMetric.Label {
+		if *label.Name == "uid" && *label.Value == "test-job-uid" {
+			foundUID = true
+			break
+		}
+	}
+	if !foundUID {
+		t.Error("Expected uid label not found")
+	}
+}

+ 29 - 12
pkg/metrics/namespacemetrics.go

@@ -40,10 +40,11 @@ func (nsac KubecostNamespaceCollector) Collect(ch chan<- prometheus.Metric) {
 	namespaces := nsac.KubeClusterCache.GetAllNamespaces()
 	for _, namespace := range namespaces {
 		nsName := namespace.Name
+		nsUID := string(namespace.UID)
 
 		labels, values := promutil.KubeAnnotationsToLabels(namespace.Annotations)
 		if len(labels) > 0 {
-			m := newNamespaceAnnotationsMetric("kube_namespace_annotations", nsName, labels, values)
+			m := newNamespaceAnnotationsMetric("kube_namespace_annotations", nsName, nsUID, labels, values)
 			ch <- m
 		}
 	}
@@ -59,16 +60,18 @@ type NamespaceAnnotationsMetric struct {
 	fqName      string
 	help        string
 	namespace   string
+	uid         string
 	labelNames  []string
 	labelValues []string
 }
 
 // Creates a new NamespaceAnnotationsMetric, implementation of prometheus.Metric
-func newNamespaceAnnotationsMetric(fqname, namespace string, labelNames []string, labelValues []string) NamespaceAnnotationsMetric {
+func newNamespaceAnnotationsMetric(fqname, namespace string, uid string, labelNames []string, labelValues []string) NamespaceAnnotationsMetric {
 	return NamespaceAnnotationsMetric{
 		fqName:      fqname,
 		help:        "kube_namespace_annotations Namespace Annotations",
 		namespace:   namespace,
+		uid:         uid,
 		labelNames:  labelNames,
 		labelValues: labelValues,
 	}
@@ -79,6 +82,7 @@ func newNamespaceAnnotationsMetric(fqname, namespace string, labelNames []string
 func (nam NamespaceAnnotationsMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"namespace": nam.namespace,
+		"uid":       nam.uid,
 	}
 	return prometheus.NewDesc(nam.fqName, nam.help, []string{}, l)
 }
@@ -98,10 +102,15 @@ func (nam NamespaceAnnotationsMetric) Write(m *dto.Metric) error {
 			Value: &nam.labelValues[i],
 		})
 	}
-	labels = append(labels, &dto.LabelPair{
-		Name:  toStringPtr("namespace"),
-		Value: &nam.namespace,
-	})
+	labels = append(labels,
+		&dto.LabelPair{
+			Name:  toStringPtr("namespace"),
+			Value: &nam.namespace,
+		},
+		&dto.LabelPair{
+			Name:  toStringPtr("uid"),
+			Value: &nam.uid,
+		})
 	m.Label = labels
 	return nil
 }
@@ -138,10 +147,11 @@ func (nsac KubeNamespaceCollector) Collect(ch chan<- prometheus.Metric) {
 	namespaces := nsac.KubeClusterCache.GetAllNamespaces()
 	for _, namespace := range namespaces {
 		nsName := namespace.Name
+		nsUID := string(namespace.UID)
 
 		labels, values := promutil.KubeLabelsToLabels(promutil.SanitizeLabels(namespace.Labels))
 		if len(labels) > 0 {
-			m := newKubeNamespaceLabelsMetric("kube_namespace_labels", nsName, labels, values)
+			m := newKubeNamespaceLabelsMetric("kube_namespace_labels", nsName, nsUID, labels, values)
 			ch <- m
 		}
 	}
@@ -157,14 +167,16 @@ type KubeNamespaceLabelsMetric struct {
 	fqName      string
 	help        string
 	namespace   string
+	uid         string
 	labelNames  []string
 	labelValues []string
 }
 
 // Creates a new KubeNamespaceLabelsMetric, implementation of prometheus.Metric
-func newKubeNamespaceLabelsMetric(fqname, namespace string, labelNames []string, labelValues []string) KubeNamespaceLabelsMetric {
+func newKubeNamespaceLabelsMetric(fqname, namespace string, uid string, labelNames []string, labelValues []string) KubeNamespaceLabelsMetric {
 	return KubeNamespaceLabelsMetric{
 		namespace:   namespace,
+		uid:         uid,
 		fqName:      fqname,
 		labelNames:  labelNames,
 		labelValues: labelValues,
@@ -195,10 +207,15 @@ func (nam KubeNamespaceLabelsMetric) Write(m *dto.Metric) error {
 			Value: &nam.labelValues[i],
 		})
 	}
-	labels = append(labels, &dto.LabelPair{
-		Name:  toStringPtr("namespace"),
-		Value: &nam.namespace,
-	})
+	labels = append(labels,
+		&dto.LabelPair{
+			Name:  toStringPtr("namespace"),
+			Value: &nam.namespace,
+		},
+		&dto.LabelPair{
+			Name:  toStringPtr("uid"),
+			Value: &nam.uid,
+		})
 	m.Label = labels
 	return nil
 }

+ 195 - 0
pkg/metrics/namespacemetrics_test.go

@@ -0,0 +1,195 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+type mockNamespaceCache struct {
+	clustercache.ClusterCache
+	namespaces []*clustercache.Namespace
+}
+
+func (m mockNamespaceCache) GetAllNamespaces() []*clustercache.Namespace {
+	return m.namespaces
+}
+
+func TestKubecostNamespaceCollector_Collect(t *testing.T) {
+	// Test with namespace that has annotations
+	cache := mockNamespaceCache{
+		namespaces: []*clustercache.Namespace{
+			{
+				Name:        "test-ns",
+				UID:         types.UID("test-uid"),
+				Annotations: map[string]string{"team": "backend"},
+			},
+		},
+	}
+	
+	collector := KubecostNamespaceCollector{
+		KubeClusterCache: cache,
+		metricsConfig:    MetricsConfig{},
+	}
+
+	ch := make(chan prometheus.Metric, 10)
+	go func() {
+		collector.Collect(ch)
+		close(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 1 {
+		t.Errorf("Expected 1 metric, got %d", count)
+	}
+}
+
+func TestKubeNamespaceCollector_Collect(t *testing.T) {
+	// Test with namespace that has labels
+	cache := mockNamespaceCache{
+		namespaces: []*clustercache.Namespace{
+			{
+				Name:   "test-ns",
+				UID:    types.UID("test-uid"),
+				Labels: map[string]string{"env": "prod"},
+			},
+		},
+	}
+	
+	collector := KubeNamespaceCollector{
+		KubeClusterCache: cache,
+		metricsConfig:    MetricsConfig{},
+	}
+
+	ch := make(chan prometheus.Metric, 10)
+	go func() {
+		collector.Collect(ch)
+		close(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 1 {
+		t.Errorf("Expected 1 metric, got %d", count)
+	}
+}
+
+func TestNamespaceAnnotationsMetric_Write(t *testing.T) {
+	metric := newNamespaceAnnotationsMetric(
+		"test_metric",
+		"test-ns",
+		"test-uid",
+		[]string{"team"},
+		[]string{"backend"},
+	)
+
+	pbMetric := &dto.Metric{}
+	err := metric.Write(pbMetric)
+	if err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	if pbMetric.Gauge == nil || *pbMetric.Gauge.Value != 1.0 {
+		t.Error("Expected gauge value 1.0")
+	}
+
+	if len(pbMetric.Label) != 3 { // team + namespace + uid
+		t.Errorf("Expected 3 labels, got %d", len(pbMetric.Label))
+	}
+
+	// Verify UID label exists and has correct value
+	foundUID := false
+	for _, label := range pbMetric.Label {
+		if *label.Name == "uid" && *label.Value == "test-uid" {
+			foundUID = true
+			break
+		}
+	}
+	if !foundUID {
+		t.Error("Expected uid label with value 'test-uid' not found")
+	}
+}
+
+func TestKubeNamespaceLabelsMetric_Write(t *testing.T) {
+	metric := newKubeNamespaceLabelsMetric(
+		"test_metric",
+		"test-ns", 
+		"test-uid",
+		[]string{"env"},
+		[]string{"prod"},
+	)
+
+	pbMetric := &dto.Metric{}
+	err := metric.Write(pbMetric)
+	if err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	if pbMetric.Gauge == nil || *pbMetric.Gauge.Value != 1.0 {
+		t.Error("Expected gauge value 1.0")
+	}
+
+	if len(pbMetric.Label) != 3 { // env + namespace + uid
+		t.Errorf("Expected 3 labels, got %d", len(pbMetric.Label))
+	}
+
+	// Verify UID label exists and has correct value
+	foundUID := false
+	for _, label := range pbMetric.Label {
+		if *label.Name == "uid" && *label.Value == "test-uid" {
+			foundUID = true
+			break
+		}
+	}
+	if !foundUID {
+		t.Error("Expected uid label with value 'test-uid' not found")
+	}
+}
+
+func TestKubecostNamespaceCollector_Describe(t *testing.T) {
+	collector := KubecostNamespaceCollector{metricsConfig: MetricsConfig{}}
+	
+	ch := make(chan *prometheus.Desc, 1)
+	go func() {
+		collector.Describe(ch)
+		close(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 1 {
+		t.Errorf("Expected 1 descriptor, got %d", count)
+	}
+}
+
+func TestKubeNamespaceCollector_Describe(t *testing.T) {
+	collector := KubeNamespaceCollector{metricsConfig: MetricsConfig{}}
+	
+	ch := make(chan *prometheus.Desc, 1)
+	go func() {
+		collector.Describe(ch)
+		close(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 1 {
+		t.Errorf("Expected 1 descriptor, got %d", count)
+	}
+}

+ 71 - 18
pkg/metrics/nodemetrics.go

@@ -63,6 +63,7 @@ func (nsac KubeNodeCollector) Collect(ch chan<- prometheus.Metric) {
 
 	for _, node := range nodes {
 		nodeName := node.Name
+		nodeUID := string(node.UID)
 
 		// Node Capacity
 		for resourceName, quantity := range node.Status.Capacity {
@@ -77,18 +78,18 @@ func (nsac KubeNodeCollector) Collect(ch chan<- prometheus.Metric) {
 			// KSM v1 Emission
 			if _, disabled := disabledMetrics["kube_node_status_capacity_cpu_cores"]; !disabled {
 				if resource == "cpu" {
-					ch <- newKubeNodeStatusCapacityCPUCoresMetric("kube_node_status_capacity_cpu_cores", nodeName, value)
+					ch <- newKubeNodeStatusCapacityCPUCoresMetric("kube_node_status_capacity_cpu_cores", nodeName, nodeUID, value)
 
 				}
 			}
 			if _, disabled := disabledMetrics["kube_node_status_capacity_memory_bytes"]; !disabled {
 				if resource == "memory" {
-					ch <- newKubeNodeStatusCapacityMemoryBytesMetric("kube_node_status_capacity_memory_bytes", nodeName, value)
+					ch <- newKubeNodeStatusCapacityMemoryBytesMetric("kube_node_status_capacity_memory_bytes", nodeName, nodeUID, value)
 				}
 			}
 
 			if _, disabled := disabledMetrics["kube_node_status_capacity"]; !disabled {
-				ch <- newKubeNodeStatusCapacityMetric("kube_node_status_capacity", nodeName, resource, unit, value)
+				ch <- newKubeNodeStatusCapacityMetric("kube_node_status_capacity", nodeName, resource, unit, nodeUID, value)
 			}
 		}
 
@@ -105,23 +106,23 @@ func (nsac KubeNodeCollector) Collect(ch chan<- prometheus.Metric) {
 			// KSM v1 Emission
 			if _, disabled := disabledMetrics["kube_node_status_allocatable_cpu_cores"]; !disabled {
 				if resource == "cpu" {
-					ch <- newKubeNodeStatusAllocatableCPUCoresMetric("kube_node_status_allocatable_cpu_cores", nodeName, value)
+					ch <- newKubeNodeStatusAllocatableCPUCoresMetric("kube_node_status_allocatable_cpu_cores", nodeName, value, nodeUID)
 				}
 			}
 			if _, disabled := disabledMetrics["kube_node_status_allocatable_memory_bytes"]; !disabled {
 				if resource == "memory" {
-					ch <- newKubeNodeStatusAllocatableMemoryBytesMetric("kube_node_status_allocatable_memory_bytes", nodeName, value)
+					ch <- newKubeNodeStatusAllocatableMemoryBytesMetric("kube_node_status_allocatable_memory_bytes", nodeName, value, nodeUID)
 				}
 			}
 			if _, disabled := disabledMetrics["kube_node_status_allocatable"]; !disabled {
-				ch <- newKubeNodeStatusAllocatableMetric("kube_node_status_allocatable", nodeName, resource, unit, value)
+				ch <- newKubeNodeStatusAllocatableMetric("kube_node_status_allocatable", nodeName, resource, unit, value, nodeUID)
 			}
 		}
 
 		// node labels
 		if _, disabled := disabledMetrics["kube_node_labels"]; !disabled {
 			labelNames, labelValues := promutil.KubePrependQualifierToLabels(promutil.SanitizeLabels(node.Labels), "label_")
-			ch <- newKubeNodeLabelsMetric(nodeName, "kube_node_labels", labelNames, labelValues)
+			ch <- newKubeNodeLabelsMetric(nodeName, "kube_node_labels", labelNames, labelValues, nodeUID)
 		}
 
 		// kube_node_status_condition
@@ -131,7 +132,7 @@ func (nsac KubeNodeCollector) Collect(ch chan<- prometheus.Metric) {
 				conditions := getConditions(c.Status)
 
 				for _, cond := range conditions {
-					ch <- newKubeNodeStatusConditionMetric(nodeName, "kube_node_status_condition", string(c.Type), cond.status, cond.value)
+					ch <- newKubeNodeStatusConditionMetric(nodeName, "kube_node_status_condition", string(c.Type), cond.status, cond.value, nodeUID)
 				}
 			}
 		}
@@ -150,10 +151,11 @@ type KubeNodeStatusCapacityMetric struct {
 	unit     string
 	node     string
 	value    float64
+	uid      string
 }
 
 // Creates a new KubeNodeStatusCapacityMetric, implementation of prometheus.Metric
-func newKubeNodeStatusCapacityMetric(fqname, node, resource, unit string, value float64) KubeNodeStatusCapacityMetric {
+func newKubeNodeStatusCapacityMetric(fqname, node, resource, unit, uid string, value float64) KubeNodeStatusCapacityMetric {
 	return KubeNodeStatusCapacityMetric{
 		fqName:   fqname,
 		help:     "kube_node_status_capacity node capacity",
@@ -161,6 +163,7 @@ func newKubeNodeStatusCapacityMetric(fqname, node, resource, unit string, value
 		resource: resource,
 		unit:     unit,
 		value:    value,
+		uid:      uid,
 	}
 }
 
@@ -171,6 +174,7 @@ func (kpcrr KubeNodeStatusCapacityMetric) Desc() *prometheus.Desc {
 		"node":     kpcrr.node,
 		"resource": kpcrr.resource,
 		"unit":     kpcrr.unit,
+		"uid":      kpcrr.uid,
 	}
 	return prometheus.NewDesc(kpcrr.fqName, kpcrr.help, []string{}, l)
 }
@@ -194,6 +198,10 @@ func (kpcrr KubeNodeStatusCapacityMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("unit"),
 			Value: &kpcrr.unit,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpcrr.uid,
+		},
 	}
 	return nil
 }
@@ -210,22 +218,24 @@ type KubeNodeStatusCapacityMemoryBytesMetric struct {
 	help   string
 	bytes  float64
 	node   string
+	uid    string
 }
 
 // Creates a new KubeNodeStatusCapacityMemoryBytesMetric, implementation of prometheus.Metric
-func newKubeNodeStatusCapacityMemoryBytesMetric(fqname string, node string, bytes float64) KubeNodeStatusCapacityMemoryBytesMetric {
+func newKubeNodeStatusCapacityMemoryBytesMetric(fqname string, node string, uid string, bytes float64) KubeNodeStatusCapacityMemoryBytesMetric {
 	return KubeNodeStatusCapacityMemoryBytesMetric{
 		fqName: fqname,
 		help:   "kube_node_status_capacity_memory_bytes Node Capacity Memory Bytes",
 		node:   node,
 		bytes:  bytes,
+		uid:    uid,
 	}
 }
 
 // Desc returns the descriptor for the Metric. This method idempotently
 // returns the same descriptor throughout the lifetime of the Metric.
 func (nam KubeNodeStatusCapacityMemoryBytesMetric) Desc() *prometheus.Desc {
-	l := prometheus.Labels{"node": nam.node}
+	l := prometheus.Labels{"node": nam.node, "uid": nam.uid}
 	return prometheus.NewDesc(nam.fqName, nam.help, []string{}, l)
 }
 
@@ -240,6 +250,10 @@ func (nam KubeNodeStatusCapacityMemoryBytesMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("node"),
 			Value: &nam.node,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &nam.uid,
+		},
 	}
 	return nil
 }
@@ -256,22 +270,24 @@ type KubeNodeStatusCapacityCPUCoresMetric struct {
 	help   string
 	cores  float64
 	node   string
+	uid    string
 }
 
 // Creates a new KubeNodeStatusCapacityCPUCoresMetric, implementation of prometheus.Metric
-func newKubeNodeStatusCapacityCPUCoresMetric(fqname string, node string, cores float64) KubeNodeStatusCapacityCPUCoresMetric {
+func newKubeNodeStatusCapacityCPUCoresMetric(fqname string, node string, uid string, cores float64) KubeNodeStatusCapacityCPUCoresMetric {
 	return KubeNodeStatusCapacityCPUCoresMetric{
 		fqName: fqname,
 		help:   "kube_node_status_capacity_cpu_cores Node Capacity CPU Cores",
 		cores:  cores,
 		node:   node,
+		uid:    uid,
 	}
 }
 
 // Desc returns the descriptor for the Metric. This method idempotently
 // returns the same descriptor throughout the lifetime of the Metric.
 func (nam KubeNodeStatusCapacityCPUCoresMetric) Desc() *prometheus.Desc {
-	l := prometheus.Labels{"node": nam.node}
+	l := prometheus.Labels{"node": nam.node, "uid": nam.uid}
 	return prometheus.NewDesc(nam.fqName, nam.help, []string{}, l)
 }
 
@@ -286,6 +302,10 @@ func (nam KubeNodeStatusCapacityCPUCoresMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("node"),
 			Value: &nam.node,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &nam.uid,
+		},
 	}
 	return nil
 }
@@ -303,16 +323,18 @@ type KubeNodeLabelsMetric struct {
 	labelNames  []string
 	labelValues []string
 	node        string
+	uid         string
 }
 
 // Creates a new KubeNodeLabelsMetric, implementation of prometheus.Metric
-func newKubeNodeLabelsMetric(node string, fqname string, labelNames []string, labelValues []string) KubeNodeLabelsMetric {
+func newKubeNodeLabelsMetric(node string, fqname string, labelNames []string, labelValues []string, uid string) KubeNodeLabelsMetric {
 	return KubeNodeLabelsMetric{
 		fqName:      fqname,
 		labelNames:  labelNames,
 		labelValues: labelValues,
 		help:        "kube_node_labels all labels for each node prefixed with label_",
 		node:        node,
+		uid:         uid,
 	}
 }
 
@@ -321,6 +343,7 @@ func newKubeNodeLabelsMetric(node string, fqname string, labelNames []string, la
 func (nam KubeNodeLabelsMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"node": nam.node,
+		"uid":  nam.uid,
 	}
 	return prometheus.NewDesc(nam.fqName, nam.help, nam.labelNames, l)
 }
@@ -343,6 +366,8 @@ func (nam KubeNodeLabelsMetric) Write(m *dto.Metric) error {
 
 	nodeString := "node"
 	labels = append(labels, &dto.LabelPair{Name: &nodeString, Value: &nam.node})
+	uidString := "uid"
+	labels = append(labels, &dto.LabelPair{Name: &uidString, Value: &nam.uid})
 	m.Label = labels
 	return nil
 }
@@ -359,10 +384,11 @@ type KubeNodeStatusConditionMetric struct {
 	condition string
 	status    string
 	value     float64
+	uid       string
 }
 
 // Creates a new KubeNodeStatusConditionMetric, implementation of prometheus.Metric
-func newKubeNodeStatusConditionMetric(node, fqname, condition, status string, value float64) KubeNodeStatusConditionMetric {
+func newKubeNodeStatusConditionMetric(node, fqname, condition, status string, value float64, uid string) KubeNodeStatusConditionMetric {
 	return KubeNodeStatusConditionMetric{
 		fqName:    fqname,
 		help:      "kube_node_status_condition condition status for nodes",
@@ -370,6 +396,7 @@ func newKubeNodeStatusConditionMetric(node, fqname, condition, status string, va
 		condition: condition,
 		status:    status,
 		value:     value,
+		uid:       uid,
 	}
 }
 
@@ -380,6 +407,7 @@ func (nam KubeNodeStatusConditionMetric) Desc() *prometheus.Desc {
 		"node":      nam.node,
 		"condition": nam.condition,
 		"status":    nam.status,
+		"uid":       nam.uid,
 	}
 	return prometheus.NewDesc(nam.fqName, nam.help, []string{}, l)
 }
@@ -403,6 +431,10 @@ func (nam KubeNodeStatusConditionMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("status"),
 			Value: &nam.status,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &nam.uid,
+		},
 	}
 	return nil
 }
@@ -439,10 +471,11 @@ type KubeNodeStatusAllocatableMetric struct {
 	unit     string
 	node     string
 	value    float64
+	uid      string
 }
 
 // Creates a new KubeNodeStatusAllocatableMetric, implementation of prometheus.Metric
-func newKubeNodeStatusAllocatableMetric(fqname, node, resource, unit string, value float64) KubeNodeStatusAllocatableMetric {
+func newKubeNodeStatusAllocatableMetric(fqname, node, resource, unit string, value float64, uid string) KubeNodeStatusAllocatableMetric {
 	return KubeNodeStatusAllocatableMetric{
 		fqName:   fqname,
 		help:     "kube_node_status_allocatable node allocatable",
@@ -450,6 +483,7 @@ func newKubeNodeStatusAllocatableMetric(fqname, node, resource, unit string, val
 		resource: resource,
 		unit:     unit,
 		value:    value,
+		uid:      uid,
 	}
 }
 
@@ -460,6 +494,7 @@ func (kpcrr KubeNodeStatusAllocatableMetric) Desc() *prometheus.Desc {
 		"node":     kpcrr.node,
 		"resource": kpcrr.resource,
 		"unit":     kpcrr.unit,
+		"uid":      kpcrr.uid,
 	}
 	return prometheus.NewDesc(kpcrr.fqName, kpcrr.help, []string{}, l)
 }
@@ -483,6 +518,10 @@ func (kpcrr KubeNodeStatusAllocatableMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("unit"),
 			Value: &kpcrr.unit,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpcrr.uid,
+		},
 	}
 	return nil
 }
@@ -499,15 +538,17 @@ type KubeNodeStatusAllocatableCPUCoresMetric struct {
 	unit     string
 	node     string
 	value    float64
+	uid      string
 }
 
 // Creates a new KubeNodeStatusAllocatableCPUCoresMetric, implementation of prometheus.Metric
-func newKubeNodeStatusAllocatableCPUCoresMetric(fqname, node string, value float64) KubeNodeStatusAllocatableCPUCoresMetric {
+func newKubeNodeStatusAllocatableCPUCoresMetric(fqname, node string, value float64, uid string) KubeNodeStatusAllocatableCPUCoresMetric {
 	return KubeNodeStatusAllocatableCPUCoresMetric{
 		fqName: fqname,
 		help:   "kube_node_status_allocatable_cpu_cores node allocatable cpu cores",
 		node:   node,
 		value:  value,
+		uid:    uid,
 	}
 }
 
@@ -516,6 +557,7 @@ func newKubeNodeStatusAllocatableCPUCoresMetric(fqname, node string, value float
 func (kpcrr KubeNodeStatusAllocatableCPUCoresMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"node": kpcrr.node,
+		"uid":  kpcrr.uid,
 	}
 	return prometheus.NewDesc(kpcrr.fqName, kpcrr.help, []string{}, l)
 }
@@ -531,6 +573,10 @@ func (kpcrr KubeNodeStatusAllocatableCPUCoresMetric) Write(m *dto.Metric) error
 			Name:  toStringPtr("node"),
 			Value: &kpcrr.node,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpcrr.uid,
+		},
 	}
 	return nil
 }
@@ -547,15 +593,17 @@ type KubeNodeStatusAllocatableMemoryBytesMetric struct {
 	unit     string
 	node     string
 	value    float64
+	uid      string
 }
 
 // Creates a new KubeNodeStatusAllocatableMemoryBytesMetric, implementation of prometheus.Metric
-func newKubeNodeStatusAllocatableMemoryBytesMetric(fqname, node string, value float64) KubeNodeStatusAllocatableMemoryBytesMetric {
+func newKubeNodeStatusAllocatableMemoryBytesMetric(fqname, node string, value float64, uid string) KubeNodeStatusAllocatableMemoryBytesMetric {
 	return KubeNodeStatusAllocatableMemoryBytesMetric{
 		fqName: fqname,
 		help:   "kube_node_status_allocatable_memory_bytes node allocatable memory in bytes",
 		node:   node,
 		value:  value,
+		uid:    uid,
 	}
 }
 
@@ -564,6 +612,7 @@ func newKubeNodeStatusAllocatableMemoryBytesMetric(fqname, node string, value fl
 func (kpcrr KubeNodeStatusAllocatableMemoryBytesMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"node": kpcrr.node,
+		"uid":  kpcrr.uid,
 	}
 	return prometheus.NewDesc(kpcrr.fqName, kpcrr.help, []string{}, l)
 }
@@ -579,6 +628,10 @@ func (kpcrr KubeNodeStatusAllocatableMemoryBytesMetric) Write(m *dto.Metric) err
 			Name:  toStringPtr("node"),
 			Value: &kpcrr.node,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpcrr.uid,
+		},
 	}
 	return nil
 }

+ 454 - 0
pkg/metrics/nodemetrics_test.go

@@ -0,0 +1,454 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+func TestKubeNodeCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name:            "all metrics enabled",
+			disabledMetrics: []string{},
+			expectedCount:   8,
+		},
+		{
+			name:            "capacity metric disabled",
+			disabledMetrics: []string{"kube_node_status_capacity"},
+			expectedCount:   7,
+		},
+		{
+			name:            "all metrics disabled",
+			disabledMetrics: []string{"kube_node_status_capacity", "kube_node_status_capacity_memory_bytes", "kube_node_status_capacity_cpu_cores", "kube_node_status_allocatable", "kube_node_status_allocatable_cpu_cores", "kube_node_status_allocatable_memory_bytes", "kube_node_labels", "kube_node_status_condition"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			nc := KubeNodeCollector{
+				KubeClusterCache: NewFakeNodeCache([]*clustercache.Node{}),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan *prometheus.Desc, 10)
+			nc.Describe(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestKubeNodeCollector_Collect(t *testing.T) {
+	tests := []struct {
+		name            string
+		nodes           []*clustercache.Node
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "single node with resources",
+			nodes: []*clustercache.Node{
+				{
+					UID:  types.UID("node-uid-1"),
+					Name: "node-1",
+					Labels: map[string]string{
+						"app": "test",
+					},
+					Status: v1.NodeStatus{
+						Capacity: v1.ResourceList{
+							v1.ResourceCPU:    resource.MustParse("4"),
+							v1.ResourceMemory: resource.MustParse("8Gi"),
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourceCPU:    resource.MustParse("3.8"),
+							v1.ResourceMemory: resource.MustParse("7.5Gi"),
+						},
+						Conditions: []v1.NodeCondition{
+							{
+								Type:   v1.NodeReady,
+								Status: v1.ConditionTrue,
+							},
+						},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   12, // 2 capacity + 2 capacity specific + 2 allocatable + 2 allocatable specific + 1 labels + 3 conditions
+		},
+		{
+			name: "multiple_nodes",
+			nodes: []*clustercache.Node{
+				{
+					Name:   "node-1",
+					Labels: map[string]string{}, // Empty labels to avoid label metrics
+					Status: v1.NodeStatus{
+						Capacity: v1.ResourceList{
+							v1.ResourceCPU:    resource.MustParse("4"),
+							v1.ResourceMemory: resource.MustParse("8Gi"),
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourceCPU:    resource.MustParse("3"),
+							v1.ResourceMemory: resource.MustParse("7Gi"),
+						},
+						Conditions: []v1.NodeCondition{}, // Empty conditions to avoid condition metrics
+					},
+					UID: types.UID("test-node-1-uid"),
+				},
+				{
+					Name:   "node-2",
+					Labels: map[string]string{}, // Empty labels to avoid label metrics
+					Status: v1.NodeStatus{
+						Capacity: v1.ResourceList{
+							v1.ResourceCPU:    resource.MustParse("4"),
+							v1.ResourceMemory: resource.MustParse("8Gi"),
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourceCPU:    resource.MustParse("3"),
+							v1.ResourceMemory: resource.MustParse("7Gi"),
+						},
+						Conditions: []v1.NodeCondition{}, // Empty conditions to avoid condition metrics
+					},
+					UID: types.UID("test-node-2-uid"),
+				},
+			},
+
+			expectedCount: 18, // 9 metrics per node × 2 nodes
+		},
+		{
+			name:            "no nodes",
+			nodes:           []*clustercache.Node{},
+			disabledMetrics: []string{},
+			expectedCount:   0,
+		},
+		{
+			name: "metrics disabled",
+			nodes: []*clustercache.Node{
+				{
+					UID:  types.UID("node-uid-1"),
+					Name: "node-1",
+					Status: v1.NodeStatus{
+						Capacity: v1.ResourceList{
+							v1.ResourceCPU: resource.MustParse("2"),
+						},
+					},
+				},
+			},
+			disabledMetrics: []string{"kube_node_status_capacity", "kube_node_status_capacity_cpu_cores", "kube_node_labels"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			nc := KubeNodeCollector{
+				KubeClusterCache: NewFakeNodeCache(tt.nodes),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan prometheus.Metric, 20)
+			nc.Collect(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestKubeNodeStatusCapacityMetric(t *testing.T) {
+	metric := newKubeNodeStatusCapacityMetric("kube_node_status_capacity", "test-node", "cpu", "core", "test-uid", 4.0)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 4.0 {
+		t.Errorf("Expected gauge value 4.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"node":     "test-node",
+		"resource": "cpu",
+		"unit":     "core",
+		"uid":      "test-uid",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Expected label %s=%s, got %s=%s", key, expectedValue, key, actualValue)
+		}
+	}
+}
+
+func TestKubeNodeLabelsMetric(t *testing.T) {
+	labelNames := []string{"app", "version"}
+	labelValues := []string{"test-app", "v1.0"}
+	uid := "test-uid"
+
+	metric := newKubeNodeLabelsMetric("test-node", "kube_node_labels", labelNames, labelValues, uid)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"app":     "test-app",
+		"version": "v1.0",
+		"node":    "test-node",
+		"uid":     uid,
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Expected label %s=%s, got %s=%s", key, expectedValue, key, actualValue)
+		}
+	}
+}
+
+func TestKubeNodeStatusConditionMetric(t *testing.T) {
+	metric := newKubeNodeStatusConditionMetric("test-node", "kube_node_status_condition", "Ready", "true", 1.0, "test-uid")
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"node":      "test-node",
+		"condition": "Ready",
+		"status":    "true",
+		"uid":       "test-uid",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Expected label %s=%s, got %s=%s", key, expectedValue, key, actualValue)
+		}
+	}
+}
+
+func TestKubeNodeStatusCapacityMemoryBytesMetric(t *testing.T) {
+	metric := newKubeNodeStatusCapacityMemoryBytesMetric("kube_node_status_capacity_memory_bytes", "test-node", "test-uid", 8589934592.0)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 8589934592.0 {
+		t.Errorf("Expected gauge value 8589934592.0, got %f", *dtoMetric.Gauge.Value)
+	}
+}
+
+func TestKubeNodeStatusCapacityCPUCoresMetric(t *testing.T) {
+	metric := newKubeNodeStatusCapacityCPUCoresMetric("kube_node_status_capacity_cpu_cores", "test-node", "test-uid", 4.0)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 4.0 {
+		t.Errorf("Expected gauge value 4.0, got %f", *dtoMetric.Gauge.Value)
+	}
+}
+
+func TestGetConditions(t *testing.T) {
+	tests := []struct {
+		name           string
+		status         v1.ConditionStatus
+		expectedValues map[string]float64
+	}{
+		{
+			name:   "condition true",
+			status: v1.ConditionTrue,
+			expectedValues: map[string]float64{
+				"true":    1.0,
+				"false":   0.0,
+				"unknown": 0.0,
+			},
+		},
+		{
+			name:   "condition false",
+			status: v1.ConditionFalse,
+			expectedValues: map[string]float64{
+				"true":    0.0,
+				"false":   1.0,
+				"unknown": 0.0,
+			},
+		},
+		{
+			name:   "condition unknown",
+			status: v1.ConditionUnknown,
+			expectedValues: map[string]float64{
+				"true":    0.0,
+				"false":   0.0,
+				"unknown": 1.0,
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			conditions := getConditions(tt.status)
+
+			if len(conditions) != 3 {
+				t.Errorf("Expected 3 conditions, got %d", len(conditions))
+			}
+
+			actualValues := make(map[string]float64)
+			for _, cond := range conditions {
+				actualValues[cond.status] = cond.value
+			}
+
+			for status, expectedValue := range tt.expectedValues {
+				if actualValue, ok := actualValues[status]; !ok {
+					t.Errorf("Missing status %s", status)
+				} else if actualValue != expectedValue {
+					t.Errorf("Expected status %s=%f, got %f", status, expectedValue, actualValue)
+				}
+			}
+		})
+	}
+}
+
+// FakeNodeCache implements ClusterCache interface for testing
+type FakeNodeCache struct {
+	clustercache.ClusterCache
+	nodes []*clustercache.Node
+}
+
+func (f FakeNodeCache) GetAllNodes() []*clustercache.Node {
+	return f.nodes
+}
+
+func NewFakeNodeCache(nodes []*clustercache.Node) FakeNodeCache {
+	return FakeNodeCache{
+		nodes: nodes,
+	}
+}

+ 10 - 2
pkg/metrics/podmetrics.go

@@ -44,11 +44,12 @@ func (kpmc KubecostPodCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, pod := range pods {
 		podName := pod.Name
 		podNS := pod.Namespace
+		podUID := string(pod.UID)
 
 		// Pod Annotations
 		labels, values := promutil.KubeAnnotationsToLabels(pod.Annotations)
 		if len(labels) > 0 {
-			ch <- newPodAnnotationMetric("kube_pod_annotations", podNS, podName, labels, values)
+			ch <- newPodAnnotationMetric("kube_pod_annotations", podNS, podName, podUID, labels, values)
 		}
 	}
 
@@ -258,17 +259,19 @@ type PodAnnotationsMetric struct {
 	help        string
 	namespace   string
 	pod         string
+	uid         string
 	labelNames  []string
 	labelValues []string
 }
 
 // Creates a new PodAnnotationsMetric, implementation of prometheus.Metric
-func newPodAnnotationMetric(fqname, namespace, pod string, labelNames, labelValues []string) PodAnnotationsMetric {
+func newPodAnnotationMetric(fqname, namespace, pod string, uid string, labelNames, labelValues []string) PodAnnotationsMetric {
 	return PodAnnotationsMetric{
 		fqName:      fqname,
 		help:        "kube_pod_annotations Pod Annotations",
 		namespace:   namespace,
 		pod:         pod,
+		uid:         uid,
 		labelNames:  labelNames,
 		labelValues: labelValues,
 	}
@@ -280,6 +283,7 @@ func (pam PodAnnotationsMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"namespace": pam.namespace,
 		"pod":       pam.pod,
+		"uid":       pam.uid,
 	}
 	return prometheus.NewDesc(pam.fqName, pam.help, []string{}, l)
 }
@@ -307,6 +311,10 @@ func (pam PodAnnotationsMetric) Write(m *dto.Metric) error {
 		&dto.LabelPair{
 			Name:  toStringPtr("pod"),
 			Value: &pam.pod,
+		},
+		&dto.LabelPair{
+			Name:  toStringPtr("uid"),
+			Value: &pam.uid,
 		})
 	m.Label = labels
 	return nil

+ 862 - 0
pkg/metrics/podmetrics_test.go

@@ -0,0 +1,862 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+func TestKubecostPodCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectMetric    bool
+	}{
+		{
+			name:            "annotations enabled",
+			disabledMetrics: []string{},
+			expectMetric:    true,
+		},
+		{
+			name:            "annotations disabled",
+			disabledMetrics: []string{"kube_pod_annotations"},
+			expectMetric:    false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kpc := KubecostPodCollector{
+				KubeClusterCache: NewFakePodCache([]*clustercache.Pod{}),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan *prometheus.Desc, 10)
+			kpc.Describe(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if tt.expectMetric && count == 0 {
+				t.Error("Expected metric description but got none")
+			}
+			if !tt.expectMetric && count > 0 {
+				t.Error("Expected no metric description but got some")
+			}
+		})
+	}
+}
+
+func TestKubecostPodCollector_Collect(t *testing.T) {
+	tests := []struct {
+		name            string
+		pods            []*clustercache.Pod
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "pod with annotations",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-1"),
+					Name:      "test-pod",
+					Namespace: "default",
+					Annotations: map[string]string{
+						"prometheus.io/scrape": "true",
+						"prometheus.io/port":   "8080",
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1,
+		},
+		{
+			name: "pod without annotations",
+			pods: []*clustercache.Pod{
+				{
+					UID:         types.UID("pod-uid-2"),
+					Name:        "empty-pod",
+					Namespace:   "default",
+					Annotations: map[string]string{},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   0,
+		},
+		{
+			name: "multiple pods with mixed annotations",
+			pods: []*clustercache.Pod{
+				{
+					UID:         types.UID("pod-uid-3"),
+					Name:        "pod1",
+					Namespace:   "ns1",
+					Annotations: map[string]string{"key": "value"},
+				},
+				{
+					UID:         types.UID("pod-uid-4"),
+					Name:        "pod2",
+					Namespace:   "ns1",
+					Annotations: map[string]string{},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1,
+		},
+		{
+			name: "metric disabled",
+			pods: []*clustercache.Pod{
+				{
+					UID:         types.UID("pod-uid-5"),
+					Name:        "test-pod",
+					Namespace:   "default",
+					Annotations: map[string]string{"test": "annotation"},
+				},
+			},
+			disabledMetrics: []string{"kube_pod_annotations"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kpc := KubecostPodCollector{
+				KubeClusterCache: NewFakePodCache(tt.pods),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan prometheus.Metric, 10)
+			kpc.Collect(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestPodAnnotationMetric(t *testing.T) {
+	labelNames := []string{"annotation_key1", "annotation_key2"}
+	labelValues := []string{"value1", "value2"}
+
+	metric := newPodAnnotationMetric("kube_pod_annotations", "test-ns", "test-pod", "test-uid", labelNames, labelValues)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"annotation_key1": "value1",
+		"annotation_key2": "value2",
+		"namespace":       "test-ns",
+		"pod":             "test-pod",
+		"uid":             "test-uid",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestKubePodCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name:            "all metrics enabled",
+			disabledMetrics: []string{},
+			expectedCount:   10,
+		},
+		{
+			name: "some metrics disabled",
+			disabledMetrics: []string{
+				"kube_pod_labels",
+				"kube_pod_owner",
+				"kube_pod_container_status_running",
+			},
+			expectedCount: 7,
+		},
+		{
+			name: "all metrics disabled",
+			disabledMetrics: []string{
+				"kube_pod_labels",
+				"kube_pod_owner",
+				"kube_pod_container_status_running",
+				"kube_pod_container_status_terminated_reason",
+				"kube_pod_container_status_restarts_total",
+				"kube_pod_container_resource_requests",
+				"kube_pod_container_resource_limits",
+				"kube_pod_container_resource_limits_cpu_cores",
+				"kube_pod_container_resource_limits_memory_bytes",
+				"kube_pod_status_phase",
+			},
+			expectedCount: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kpc := KubePodCollector{
+				KubeClusterCache: NewFakePodCache([]*clustercache.Pod{}),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan *prometheus.Desc, 15)
+			kpc.Describe(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestKubePodCollector_Collect(t *testing.T) {
+	boolTrue := true
+	tests := []struct {
+		name            string
+		pods            []*clustercache.Pod
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "pod with all features",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-1"),
+					Name:      "test-pod",
+					Namespace: "default",
+					Labels: map[string]string{
+						"app":     "test",
+						"version": "v1",
+					},
+					OwnerReferences: []metav1.OwnerReference{
+						{
+							Name:       "test-deployment",
+							Kind:       "Deployment",
+							Controller: &boolTrue,
+						},
+					},
+					Status: clustercache.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{
+								Name:         "container1",
+								RestartCount: 2,
+								State: v1.ContainerState{
+									Running: &v1.ContainerStateRunning{},
+								},
+							},
+						},
+					},
+					Spec: clustercache.PodSpec{
+						NodeName: "node1",
+						Containers: []clustercache.Container{
+							{
+								Name: "container1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										v1.ResourceCPU:    resource.MustParse("100m"),
+										v1.ResourceMemory: resource.MustParse("128Mi"),
+									},
+									Limits: v1.ResourceList{
+										v1.ResourceCPU:    resource.MustParse("200m"),
+										v1.ResourceMemory: resource.MustParse("256Mi"),
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   15, // 5 phases + 1 labels + 1 owner + 1 restarts + 1 running + 2 requests + 4 limits
+		},
+		{
+			name: "pod without containers",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-2"),
+					Name:      "empty-pod",
+					Namespace: "default",
+					Labels:    map[string]string{"test": "label"},
+					Status: clustercache.PodStatus{
+						Phase: v1.PodPending,
+					},
+					Spec: clustercache.PodSpec{
+						Containers: []clustercache.Container{},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   6, // 5 phases + 1 labels
+		},
+		{
+			name: "pod with terminated container",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-3"),
+					Name:      "terminated-pod",
+					Namespace: "default",
+					Labels:    map[string]string{},
+					Status: clustercache.PodStatus{
+						Phase: v1.PodFailed,
+						ContainerStatuses: []v1.ContainerStatus{
+							{
+								Name:         "failed-container",
+								RestartCount: 5,
+								State: v1.ContainerState{
+									Terminated: &v1.ContainerStateTerminated{
+										Reason: "OOMKilled",
+									},
+								},
+							},
+						},
+					},
+					Spec: clustercache.PodSpec{
+						Containers: []clustercache.Container{
+							{
+								Name:      "failed-container",
+								Resources: v1.ResourceRequirements{},
+							},
+						},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   8, // 5 phases + 1 labels + 1 restarts + 1 terminated reason
+		},
+		{
+			name: "pod without phase",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-4"),
+					Name:      "no-phase-pod",
+					Namespace: "default",
+					Labels:    map[string]string{"app": "test"},
+					Status: clustercache.PodStatus{
+						Phase: "", // Empty phase
+					},
+					Spec: clustercache.PodSpec{},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1, // Only labels
+		},
+		{
+			name: "multiple containers",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-5"),
+					Name:      "multi-container-pod",
+					Namespace: "default",
+					Labels:    map[string]string{},
+					Status: clustercache.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{
+								Name:         "container1",
+								RestartCount: 0,
+								State: v1.ContainerState{
+									Running: &v1.ContainerStateRunning{},
+								},
+							},
+							{
+								Name:         "container2",
+								RestartCount: 1,
+								State: v1.ContainerState{
+									Running: &v1.ContainerStateRunning{},
+								},
+							},
+						},
+					},
+					Spec: clustercache.PodSpec{
+						NodeName: "node2",
+						Containers: []clustercache.Container{
+							{
+								Name: "container1",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										v1.ResourceCPU: resource.MustParse("50m"),
+									},
+									Limits: v1.ResourceList{
+										v1.ResourceCPU: resource.MustParse("100m"),
+									},
+								},
+							},
+							{
+								Name: "container2",
+								Resources: v1.ResourceRequirements{
+									Requests: v1.ResourceList{
+										v1.ResourceMemory: resource.MustParse("64Mi"),
+									},
+									Limits: v1.ResourceList{
+										v1.ResourceMemory: resource.MustParse("128Mi"),
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   16, // 5 phases + 1 labels + 2 restarts + 2 running + 2 requests + 4 limits
+		},
+		{
+			name: "metrics disabled",
+			pods: []*clustercache.Pod{
+				{
+					UID:       types.UID("pod-uid-6"),
+					Name:      "test-pod",
+					Namespace: "default",
+					Labels:    map[string]string{"app": "test"},
+					Status: clustercache.PodStatus{
+						Phase: v1.PodRunning,
+					},
+					Spec: clustercache.PodSpec{},
+				},
+			},
+			disabledMetrics: []string{"kube_pod_labels", "kube_pod_status_phase"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			kpc := KubePodCollector{
+				KubeClusterCache: NewFakePodCache(tt.pods),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan prometheus.Metric, 30)
+			kpc.Collect(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestKubePodLabelsMetric(t *testing.T) {
+	labelNames := []string{"label_app", "label_env"}
+	labelValues := []string{"webapp", "production"}
+
+	metric := newKubePodLabelsMetric("kube_pod_labels", "prod", "web-pod", "pod-uid", labelNames, labelValues)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"label_app": "webapp",
+		"label_env": "production",
+		"namespace": "prod",
+		"pod":       "web-pod",
+		"uid":       "pod-uid",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestKubePodContainerStatusRestartsTotalMetric(t *testing.T) {
+	metric := newKubePodContainerStatusRestartsTotalMetric("kube_pod_container_status_restarts_total", "default", "test-pod", "pod-uid", "app-container", 3.0)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Counter == nil {
+		t.Error("Expected counter metric")
+	}
+
+	if *dtoMetric.Counter.Value != 3.0 {
+		t.Errorf("Expected counter value 3.0, got %f", *dtoMetric.Counter.Value)
+	}
+}
+
+func TestKubePodContainerStatusTerminatedReasonMetric(t *testing.T) {
+	metric := newKubePodContainerStatusTerminatedReasonMetric("kube_pod_container_status_terminated_reason", "default", "crashed-pod", "pod-uid", "failing-container", "Error")
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Check for reason label
+	hasReason := false
+	for _, label := range dtoMetric.Label {
+		if *label.Name == "reason" && *label.Value == "Error" {
+			hasReason = true
+			break
+		}
+	}
+	if !hasReason {
+		t.Error("Expected reason label with value 'Error'")
+	}
+}
+
+func TestKubePodStatusPhaseMetric(t *testing.T) {
+	metric := newKubePodStatusPhaseMetric("kube_pod_status_phase", "default", "test-pod", "pod-uid", "Running", 1.0)
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	// Check phase label
+	hasPhase := false
+	for _, label := range dtoMetric.Label {
+		if *label.Name == "phase" && *label.Value == "Running" {
+			hasPhase = true
+			break
+		}
+	}
+	if !hasPhase {
+		t.Error("Expected phase label with value 'Running'")
+	}
+}
+
+func TestKubePodContainerStatusRunningMetric(t *testing.T) {
+	metric := newKubePodContainerStatusRunningMetric("kube_pod_container_status_running", "default", "running-pod", "pod-uid", "web-container")
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+}
+
+func TestKubePodContainerResourceRequestsMetric(t *testing.T) {
+	metric := newKubePodContainerResourceRequestsMetric("kube_pod_container_resource_requests", "default", "test-pod", "pod-uid", "container1", "node1", "cpu", "core", 0.1)
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 0.1 {
+		t.Errorf("Expected gauge value 0.1, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify all labels
+	expectedLabels := map[string]string{
+		"namespace": "default",
+		"pod":       "test-pod",
+		"container": "container1",
+		"uid":       "pod-uid",
+		"node":      "node1",
+		"resource":  "cpu",
+		"unit":      "core",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestKubePodContainerResourceLimitsMetric(t *testing.T) {
+	metric := newKubePodContainerResourceLimitsMetric("kube_pod_container_resource_limits", "default", "test-pod", "pod-uid", "container1", "node1", "memory", "byte", 268435456)
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 268435456 {
+		t.Errorf("Expected gauge value 268435456, got %f", *dtoMetric.Gauge.Value)
+	}
+}
+
+func TestKubePodContainerResourceLimitsCPUCoresMetric(t *testing.T) {
+	metric := newKubePodContainerResourceLimitsCPUCoresMetric("kube_pod_container_resource_limits_cpu_cores", "default", "test-pod", "pod-uid", "container1", "node1", 2.0)
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 2.0 {
+		t.Errorf("Expected gauge value 2.0, got %f", *dtoMetric.Gauge.Value)
+	}
+}
+
+func TestKubePodContainerResourceLimitsMemoryBytesMetric(t *testing.T) {
+	metric := newKubePodContainerResourceLimitsMemoryBytesMetric("kube_pod_container_resource_limits_memory_bytes", "default", "test-pod", "pod-uid", "container1", "node1", 536870912)
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 536870912 {
+		t.Errorf("Expected gauge value 536870912, got %f", *dtoMetric.Gauge.Value)
+	}
+}
+
+func TestKubePodOwnerMetric(t *testing.T) {
+	metric := newKubePodOwnerMetric("kube_pod_owner", "default", "test-pod", "test-replicaset", "ReplicaSet", true)
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify owner-specific labels
+	expectedLabels := map[string]string{
+		"namespace":           "default",
+		"pod":                 "test-pod",
+		"owner_name":          "test-replicaset",
+		"owner_kind":          "ReplicaSet",
+		"owner_is_controller": "true",
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestPodPhaseMetrics(t *testing.T) {
+	// Test that all pod phases generate correct metrics
+	pod := &clustercache.Pod{
+		UID:       types.UID("phase-test-uid"),
+		Name:      "phase-test-pod",
+		Namespace: "default",
+		Labels:    map[string]string{},
+		Status: clustercache.PodStatus{
+			Phase: v1.PodRunning,
+		},
+		Spec: clustercache.PodSpec{},
+	}
+
+	mc := MetricsConfig{
+		DisabledMetrics: []string{"kube_pod_labels"}, // Only test phase metrics
+	}
+	kpc := KubePodCollector{
+		KubeClusterCache: NewFakePodCache([]*clustercache.Pod{pod}),
+		metricsConfig:    mc,
+	}
+
+	ch := make(chan prometheus.Metric, 10)
+	kpc.Collect(ch)
+	close(ch)
+
+	phaseMetrics := make(map[string]float64)
+	for metric := range ch {
+		var dtoMetric dto.Metric
+		metric.Write(&dtoMetric)
+
+		for _, label := range dtoMetric.Label {
+			if *label.Name == "phase" {
+				phaseMetrics[*label.Value] = *dtoMetric.Gauge.Value
+			}
+		}
+	}
+
+	// Verify all phases are emitted
+	expectedPhases := map[string]float64{
+		"Pending":   0.0,
+		"Succeeded": 0.0,
+		"Failed":    0.0,
+		"Unknown":   0.0,
+		"Running":   1.0, // Only Running should be 1
+	}
+
+	for phase, expectedValue := range expectedPhases {
+		if actualValue, ok := phaseMetrics[phase]; !ok {
+			t.Errorf("Missing phase metric for %s", phase)
+		} else if actualValue != expectedValue {
+			t.Errorf("Phase %s: expected value %f, got %f", phase, expectedValue, actualValue)
+		}
+	}
+}
+
+// FakePodCache implements ClusterCache interface for testing
+type FakePodCache struct {
+	clustercache.ClusterCache
+	pods []*clustercache.Pod
+}
+
+func (f FakePodCache) GetAllPods() []*clustercache.Pod {
+	return f.pods
+}
+
+func NewFakePodCache(pods []*clustercache.Pod) FakePodCache {
+	return FakePodCache{
+		pods: pods,
+	}
+}

+ 19 - 4
pkg/metrics/pvcmetrics.go

@@ -37,14 +37,15 @@ func (kpvc KubePVCCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, pvc := range pvcs {
 		storageClass := getPersistentVolumeClaimClass(pvc)
 		volume := pvc.Spec.VolumeName
+		pvcUID := string(pvc.UID)
 
 		if _, disabled := disabledMetrics["kube_persistentvolumeclaim_info"]; !disabled {
-			ch <- newKubePVCInfoMetric("kube_persistentvolumeclaim_info", pvc.Name, pvc.Namespace, storageClass, volume)
+			ch <- newKubePVCInfoMetric("kube_persistentvolumeclaim_info", pvc.Name, pvc.Namespace, pvcUID, storageClass, volume)
 		}
 
 		if storage, ok := pvc.Spec.Resources.Requests[v1.ResourceStorage]; ok {
 			if _, disabled := disabledMetrics["kube_persistentvolumeclaim_resource_requests_storage_bytes"]; !disabled {
-				ch <- newKubePVCResourceRequestsStorageBytesMetric("kube_persistentvolumeclaim_resource_requests_storage_bytes", pvc.Name, pvc.Namespace, float64(storage.Value()))
+				ch <- newKubePVCResourceRequestsStorageBytesMetric("kube_persistentvolumeclaim_resource_requests_storage_bytes", pvc.Name, pvc.Namespace, pvcUID, float64(storage.Value()))
 			}
 		}
 	}
@@ -61,16 +62,18 @@ type KubePVCResourceRequestsStorageBytesMetric struct {
 	namespace string
 	pvc       string
 	value     float64
+	uid       string
 }
 
 // Creates a new KubePVCResourceRequestsStorageBytesMetric, implementation of prometheus.Metric
-func newKubePVCResourceRequestsStorageBytesMetric(fqname, pvc, namespace string, value float64) KubePVCResourceRequestsStorageBytesMetric {
+func newKubePVCResourceRequestsStorageBytesMetric(fqname, pvc, namespace, uid string, value float64) KubePVCResourceRequestsStorageBytesMetric {
 	return KubePVCResourceRequestsStorageBytesMetric{
 		fqName:    fqname,
 		help:      "kube_persistentvolumeclaim_resource_requests_storage_bytes pvc storage resource requests in bytes",
 		pvc:       pvc,
 		namespace: namespace,
 		value:     value,
+		uid:       uid,
 	}
 }
 
@@ -80,6 +83,7 @@ func (kpvcrr KubePVCResourceRequestsStorageBytesMetric) Desc() *prometheus.Desc
 	l := prometheus.Labels{
 		"persistentvolumeclaim": kpvcrr.pvc,
 		"namespace":             kpvcrr.namespace,
+		"uid":                   kpvcrr.uid,
 	}
 	return prometheus.NewDesc(kpvcrr.fqName, kpvcrr.help, []string{}, l)
 }
@@ -100,6 +104,10 @@ func (kpvcrr KubePVCResourceRequestsStorageBytesMetric) Write(m *dto.Metric) err
 			Name:  toStringPtr("namespace"),
 			Value: &kpvcrr.namespace,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpvcrr.uid,
+		},
 	}
 	return nil
 }
@@ -116,10 +124,11 @@ type KubePVCInfoMetric struct {
 	pvc          string
 	storageclass string
 	volume       string
+	uid          string
 }
 
 // Creates a new KubePVCInfoMetric, implementation of prometheus.Metric
-func newKubePVCInfoMetric(fqname, pvc, namespace, storageclass, volume string) KubePVCInfoMetric {
+func newKubePVCInfoMetric(fqname, pvc, namespace, uid, storageclass, volume string) KubePVCInfoMetric {
 	return KubePVCInfoMetric{
 		fqName:       fqname,
 		help:         "kube_persistentvolumeclaim_info pvc storage resource requests in bytes",
@@ -127,6 +136,7 @@ func newKubePVCInfoMetric(fqname, pvc, namespace, storageclass, volume string) K
 		namespace:    namespace,
 		storageclass: storageclass,
 		volume:       volume,
+		uid:          uid,
 	}
 }
 
@@ -138,6 +148,7 @@ func (kpvcrr KubePVCInfoMetric) Desc() *prometheus.Desc {
 		"namespace":             kpvcrr.namespace,
 		"storageclass":          kpvcrr.storageclass,
 		"volumename":            kpvcrr.volume,
+		"uid":                   kpvcrr.uid,
 	}
 	return prometheus.NewDesc(kpvcrr.fqName, kpvcrr.help, []string{}, l)
 }
@@ -167,6 +178,10 @@ func (kpvci KubePVCInfoMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("volumename"),
 			Value: &kpvci.volume,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpvci.uid,
+		},
 	}
 	return nil
 }

+ 130 - 0
pkg/metrics/pvcmetrics_test.go

@@ -0,0 +1,130 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+func collectMetrics(collector KubePVCCollector) []prometheus.Metric {
+	ch := make(chan prometheus.Metric, 10)
+	go func() {
+		defer close(ch)
+		collector.Collect(ch)
+	}()
+
+	var metrics []prometheus.Metric
+	for metric := range ch {
+		metrics = append(metrics, metric)
+	}
+	return metrics
+}
+
+
+func TestKubePVCCollector_Describe(t *testing.T) {
+	collector := KubePVCCollector{metricsConfig: MetricsConfig{}}
+	ch := make(chan *prometheus.Desc, 5)
+	go func() {
+		defer close(ch)
+		collector.Describe(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 2 {
+		t.Errorf("Expected 2 metrics described, got %d", count)
+	}
+}
+
+
+func TestKubePVCCollector_Collect(t *testing.T) {
+	storageSize := resource.MustParse("1Gi")
+	pvc := &clustercache.PersistentVolumeClaim{
+		UID:       types.UID("test-uid"),
+		Name:      "test-pvc",
+		Namespace: "default",
+		Spec: v1.PersistentVolumeClaimSpec{
+			Resources: v1.VolumeResourceRequirements{
+				Requests: v1.ResourceList{v1.ResourceStorage: storageSize},
+			},
+		},
+	}
+
+	cache := NewFakePVCCache([]*clustercache.PersistentVolumeClaim{pvc})
+	collector := KubePVCCollector{
+		KubeClusterCache: cache,
+		metricsConfig:    MetricsConfig{},
+	}
+
+	metrics := collectMetrics(collector)
+	if len(metrics) != 2 {
+		t.Errorf("Expected 2 metrics, got %d", len(metrics))
+	}
+
+	// Verify UID label exists in metrics
+	for _, metric := range metrics {
+		var m dto.Metric
+		if err := metric.Write(&m); err != nil {
+			t.Errorf("Error writing metric: %v", err)
+		}
+
+		hasUID := false
+		for _, label := range m.Label {
+			if *label.Name == "uid" && *label.Value == "test-uid" {
+				hasUID = true
+				break
+			}
+		}
+		if !hasUID {
+			t.Error("Metric missing UID label")
+		}
+	}
+}
+
+
+func TestKubePVCMetrics_UIDLabel(t *testing.T) {
+	metric := newKubePVCResourceRequestsStorageBytesMetric(
+		"test_metric", "test-pvc", "test-namespace", "test-uid", 1000.0,
+	)
+
+	var m dto.Metric
+	if err := metric.Write(&m); err != nil {
+		t.Fatalf("Error writing metric: %v", err)
+	}
+
+	// Verify UID label exists
+	for _, label := range m.Label {
+		if *label.Name == "uid" && *label.Value == "test-uid" {
+			return
+		}
+	}
+	t.Error("UID label not found in metric")
+}
+
+
+
+
+
+
+type FakePVCCache struct {
+	clustercache.ClusterCache
+	pvcs []*clustercache.PersistentVolumeClaim
+}
+
+func (f FakePVCCache) GetAllPersistentVolumeClaims() []*clustercache.PersistentVolumeClaim {
+	return f.pvcs
+}
+
+func NewFakePVCCache(pvcs []*clustercache.PersistentVolumeClaim) FakePVCCache {
+	return FakePVCCache{
+		pvcs: pvcs,
+	}
+}

+ 30 - 7
pkg/metrics/pvmetrics.go

@@ -39,6 +39,8 @@ func (kpvcb KubePVCollector) Collect(ch chan<- prometheus.Metric) {
 	disabledMetrics := kpvcb.metricsConfig.GetDisabledMetricsMap()
 
 	for _, pv := range pvs {
+		pvUID := string(pv.UID)
+
 		if _, disabled := disabledMetrics["kube_persistentvolume_status_phase"]; !disabled {
 			phase := pv.Status.Phase
 			if phase != "" {
@@ -54,14 +56,14 @@ func (kpvcb KubePVCollector) Collect(ch chan<- prometheus.Metric) {
 				}
 
 				for _, p := range phases {
-					ch <- newKubePVStatusPhaseMetric("kube_persistentvolume_status_phase", pv.Name, p.n, boolFloat64(p.v))
+					ch <- newKubePVStatusPhaseMetric("kube_persistentvolume_status_phase", pv.Name, pvUID, p.n, boolFloat64(p.v))
 				}
 			}
 		}
 
 		if _, disabled := disabledMetrics["kube_persistentvolume_capacity_bytes"]; !disabled {
 			storage := pv.Spec.Capacity[v1.ResourceStorage]
-			m := newKubePVCapacityBytesMetric("kube_persistentvolume_capacity_bytes", pv.Name, float64(storage.Value()))
+			m := newKubePVCapacityBytesMetric("kube_persistentvolume_capacity_bytes", pv.Name, pvUID, float64(storage.Value()))
 			ch <- m
 		}
 
@@ -72,7 +74,7 @@ func (kpvcb KubePVCollector) Collect(ch chan<- prometheus.Metric) {
 			if pv.Spec.CSI != nil && pv.Spec.CSI.VolumeHandle != "" {
 				providerID = pv.Spec.CSI.VolumeHandle
 			}
-			m := newKubecostPVInfoMetric("kubecost_pv_info", pv.Name, storageClass, providerID, float64(1))
+			m := newKubecostPVInfoMetric("kubecost_pv_info", pv.Name, pvUID, storageClass, providerID, float64(1))
 			ch <- m
 		}
 	}
@@ -88,15 +90,17 @@ type KubePVCapacityBytesMetric struct {
 	help   string
 	pv     string
 	value  float64
+	uid    string
 }
 
 // Creates a new KubePVCapacityBytesMetric, implementation of prometheus.Metric
-func newKubePVCapacityBytesMetric(fqname, pv string, value float64) KubePVCapacityBytesMetric {
+func newKubePVCapacityBytesMetric(fqname, pv, uid string, value float64) KubePVCapacityBytesMetric {
 	return KubePVCapacityBytesMetric{
 		fqName: fqname,
 		help:   "kube_persistentvolume_capacity_bytes pv storage capacity in bytes",
 		pv:     pv,
 		value:  value,
+		uid:    uid,
 	}
 }
 
@@ -105,6 +109,7 @@ func newKubePVCapacityBytesMetric(fqname, pv string, value float64) KubePVCapaci
 func (kpcrr KubePVCapacityBytesMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"persistentvolume": kpcrr.pv,
+		"uid":              kpcrr.uid,
 	}
 	return prometheus.NewDesc(kpcrr.fqName, kpcrr.help, []string{}, l)
 }
@@ -121,6 +126,10 @@ func (kpcrr KubePVCapacityBytesMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("persistentvolume"),
 			Value: &kpcrr.pv,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpcrr.uid,
+		},
 	}
 	return nil
 }
@@ -136,16 +145,18 @@ type KubePVStatusPhaseMetric struct {
 	pv     string
 	phase  string
 	value  float64
+	uid    string
 }
 
-// Creates a new KubePVCapacityBytesMetric, implementation of prometheus.Metric
-func newKubePVStatusPhaseMetric(fqname, pv, phase string, value float64) KubePVStatusPhaseMetric {
+// Creates a new KubePVStatusPhaseMetric, implementation of prometheus.Metric
+func newKubePVStatusPhaseMetric(fqname, pv, uid, phase string, value float64) KubePVStatusPhaseMetric {
 	return KubePVStatusPhaseMetric{
 		fqName: fqname,
 		help:   "kube_persistentvolume_status_phase pv status phase",
 		pv:     pv,
 		phase:  phase,
 		value:  value,
+		uid:    uid,
 	}
 }
 
@@ -155,6 +166,7 @@ func (kpcrr KubePVStatusPhaseMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"persistentvolume": kpcrr.pv,
 		"phase":            kpcrr.phase,
+		"uid":              kpcrr.uid,
 	}
 	return prometheus.NewDesc(kpcrr.fqName, kpcrr.help, []string{}, l)
 }
@@ -175,6 +187,10 @@ func (kpcrr KubePVStatusPhaseMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("phase"),
 			Value: &kpcrr.phase,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpcrr.uid,
+		},
 	}
 	return nil
 }
@@ -192,10 +208,11 @@ type KubecostPVInfoMetric struct {
 	storageClass string
 	value        float64
 	providerId   string
+	uid          string
 }
 
 // Creates a new newKubecostPVInfoMetric, implementation of prometheus.Metric
-func newKubecostPVInfoMetric(fqname, pv, storageClass, providerID string, value float64) KubecostPVInfoMetric {
+func newKubecostPVInfoMetric(fqname, pv, uid, storageClass, providerID string, value float64) KubecostPVInfoMetric {
 	return KubecostPVInfoMetric{
 		fqName:       fqname,
 		help:         "kubecost_pv_info pv info",
@@ -203,6 +220,7 @@ func newKubecostPVInfoMetric(fqname, pv, storageClass, providerID string, value
 		storageClass: storageClass,
 		value:        value,
 		providerId:   providerID,
+		uid:          uid,
 	}
 }
 
@@ -213,6 +231,7 @@ func (kpvim KubecostPVInfoMetric) Desc() *prometheus.Desc {
 		"persistentvolume": kpvim.pv,
 		"storageclass":     kpvim.storageClass,
 		"provider_id":      kpvim.providerId,
+		"uid":              kpvim.uid,
 	}
 	return prometheus.NewDesc(kpvim.fqName, kpvim.help, []string{}, l)
 }
@@ -237,6 +256,10 @@ func (kpvim KubecostPVInfoMetric) Write(m *dto.Metric) error {
 			Name:  toStringPtr("provider_id"),
 			Value: &kpvim.providerId,
 		},
+		{
+			Name:  toStringPtr("uid"),
+			Value: &kpvim.uid,
+		},
 	}
 	return nil
 }

+ 124 - 0
pkg/metrics/pvmetrics_test.go

@@ -0,0 +1,124 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+func collectPVMetrics(collector KubePVCollector) []prometheus.Metric {
+	ch := make(chan prometheus.Metric, 10)
+	go func() {
+		defer close(ch)
+		collector.Collect(ch)
+	}()
+
+	var metrics []prometheus.Metric
+	for metric := range ch {
+		metrics = append(metrics, metric)
+	}
+	return metrics
+}
+
+func TestKubePVCollector_Describe(t *testing.T) {
+	collector := KubePVCollector{metricsConfig: MetricsConfig{}}
+	ch := make(chan *prometheus.Desc, 5)
+	go func() {
+		defer close(ch)
+		collector.Describe(ch)
+	}()
+
+	count := 0
+	for range ch {
+		count++
+	}
+
+	if count != 3 {
+		t.Errorf("Expected 3 metrics described, got %d", count)
+	}
+}
+
+func TestKubePVCollector_Collect(t *testing.T) {
+	storageSize := resource.MustParse("10Gi")
+	pv := &clustercache.PersistentVolume{
+		UID:  types.UID("test-pv-uid"),
+		Name: "test-pv",
+		Spec: v1.PersistentVolumeSpec{
+			Capacity: v1.ResourceList{
+				v1.ResourceStorage: storageSize,
+			},
+		},
+		Status: v1.PersistentVolumeStatus{
+			Phase: v1.VolumeBound,
+		},
+	}
+
+	cache := NewFakePVCache([]*clustercache.PersistentVolume{pv})
+	collector := KubePVCollector{
+		KubeClusterCache: cache,
+		metricsConfig:    MetricsConfig{},
+	}
+
+	metrics := collectPVMetrics(collector)
+	if len(metrics) != 7 { // 1 capacity + 5 phase + 1 info
+		t.Errorf("Expected 7 metrics, got %d", len(metrics))
+	}
+
+	// Verify UID label exists in metrics
+	for _, metric := range metrics {
+		var m dto.Metric
+		if err := metric.Write(&m); err != nil {
+			t.Errorf("Error writing metric: %v", err)
+		}
+
+		hasUID := false
+		for _, label := range m.Label {
+			if *label.Name == "uid" && *label.Value == "test-pv-uid" {
+				hasUID = true
+				break
+			}
+		}
+		if !hasUID {
+			t.Error("Metric missing UID label")
+		}
+	}
+}
+
+func TestKubePVMetrics_UIDLabel(t *testing.T) {
+	metric := newKubePVCapacityBytesMetric(
+		"test_metric", "test-pv", "test-uid", 1000.0,
+	)
+
+	var m dto.Metric
+	if err := metric.Write(&m); err != nil {
+		t.Fatalf("Error writing metric: %v", err)
+	}
+
+	// Verify UID label exists
+	for _, label := range m.Label {
+		if *label.Name == "uid" && *label.Value == "test-uid" {
+			return
+		}
+	}
+	t.Error("UID label not found in metric")
+}
+
+type FakePVCache struct {
+	clustercache.ClusterCache
+	pvs []*clustercache.PersistentVolume
+}
+
+func (f FakePVCache) GetAllPersistentVolumes() []*clustercache.PersistentVolume {
+	return f.pvs
+}
+
+func NewFakePVCache(pvs []*clustercache.PersistentVolume) FakePVCache {
+	return FakePVCache{
+		pvs: pvs,
+	}
+}

+ 10 - 2
pkg/metrics/servicemetrics.go

@@ -41,10 +41,11 @@ func (sc KubecostServiceCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, svc := range svcs {
 		serviceName := svc.Name
 		serviceNS := svc.Namespace
+		serviceUID := string(svc.UID)
 
 		labels, values := promutil.KubeLabelsToLabels(promutil.SanitizeLabels(svc.SpecSelector))
 		if len(labels) > 0 {
-			m := newServiceSelectorLabelsMetric(serviceName, serviceNS, "service_selector_labels", labels, values)
+			m := newServiceSelectorLabelsMetric(serviceName, serviceNS, "service_selector_labels", labels, values, serviceUID)
 			ch <- m
 		}
 	}
@@ -63,10 +64,11 @@ type ServiceSelectorLabelsMetric struct {
 	labelValues []string
 	serviceName string
 	namespace   string
+	uid         string
 }
 
 // Creates a new ServiceMetric, implementation of prometheus.Metric
-func newServiceSelectorLabelsMetric(name, namespace, fqname string, labelNames, labelvalues []string) ServiceSelectorLabelsMetric {
+func newServiceSelectorLabelsMetric(name, namespace, fqname string, labelNames, labelvalues []string, uid string) ServiceSelectorLabelsMetric {
 	return ServiceSelectorLabelsMetric{
 		fqName:      fqname,
 		labelNames:  labelNames,
@@ -74,6 +76,7 @@ func newServiceSelectorLabelsMetric(name, namespace, fqname string, labelNames,
 		help:        "service_selector_labels Service Selector Labels",
 		serviceName: name,
 		namespace:   namespace,
+		uid:         uid,
 	}
 }
 
@@ -83,6 +86,7 @@ func (s ServiceSelectorLabelsMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"service":   s.serviceName,
 		"namespace": s.namespace,
+		"uid":       s.uid,
 	}
 	return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
 }
@@ -109,6 +113,10 @@ func (s ServiceSelectorLabelsMetric) Write(m *dto.Metric) error {
 		Name:  toStringPtr("service"),
 		Value: &s.serviceName,
 	})
+	labels = append(labels, &dto.LabelPair{
+		Name:  toStringPtr("uid"),
+		Value: &s.uid,
+	})
 	m.Label = labels
 	return nil
 }

+ 233 - 0
pkg/metrics/servicemetrics_test.go

@@ -0,0 +1,233 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+func TestKubecostServiceCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectMetric    bool
+	}{
+		{
+			name:            "service_selector_labels enabled",
+			disabledMetrics: []string{},
+			expectMetric:    true,
+		},
+		{
+			name:            "service_selector_labels disabled",
+			disabledMetrics: []string{"service_selector_labels"},
+			expectMetric:    false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			sc := KubecostServiceCollector{
+				KubeClusterCache: NewFakeServiceCache([]*clustercache.Service{}),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan *prometheus.Desc, 10)
+			sc.Describe(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if tt.expectMetric && count == 0 {
+				t.Error("Expected metric description but got none")
+			}
+			if !tt.expectMetric && count > 0 {
+				t.Error("Expected no metric description but got some")
+			}
+		})
+	}
+}
+
+func TestKubecostServiceCollector_Collect(t *testing.T) {
+	tests := []struct {
+		name            string
+		services        []*clustercache.Service
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "single service with selector",
+			services: []*clustercache.Service{
+				{
+					UID:          types.UID("test-uid-1"),
+					Name:         "test-service",
+					Namespace:    "default",
+					SpecSelector: map[string]string{"app": "test", "version": "v1"},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1,
+		},
+		{
+			name: "service without selector",
+			services: []*clustercache.Service{
+				{
+					UID:          types.UID("test-uid-2"),
+					Name:         "headless-service",
+					Namespace:    "default",
+					SpecSelector: map[string]string{},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   0,
+		},
+		{
+			name: "multiple services with selectors",
+			services: []*clustercache.Service{
+				{
+					UID:          types.UID("test-uid-3"),
+					Name:         "service1",
+					Namespace:    "ns1",
+					SpecSelector: map[string]string{"app": "app1"},
+				},
+				{
+					UID:          types.UID("test-uid-4"),
+					Name:         "service2",
+					Namespace:    "ns2",
+					SpecSelector: map[string]string{"component": "frontend"},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   2,
+		},
+		{
+			name: "metric disabled",
+			services: []*clustercache.Service{
+				{
+					UID:          types.UID("test-uid-5"),
+					Name:         "test-service",
+					Namespace:    "default",
+					SpecSelector: map[string]string{"app": "test"},
+				},
+			},
+			disabledMetrics: []string{"service_selector_labels"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			sc := KubecostServiceCollector{
+				KubeClusterCache: NewFakeServiceCache(tt.services),
+				metricsConfig:    mc,
+			}
+
+			ch := make(chan prometheus.Metric, 10)
+			sc.Collect(ch)
+			close(ch)
+
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+func TestServiceSelectorLabelsMetric(t *testing.T) {
+	labelNames := []string{"app", "version"}
+	labelValues := []string{"test-app", "v1.0"}
+	uid := "test-uid"
+
+	metric := newServiceSelectorLabelsMetric("test-service", "default", "service_selector_labels", labelNames, labelValues, uid)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"app":       "test-app",
+		"version":   "v1.0",
+		"service":   "test-service",
+		"namespace": "default",
+		"uid":       uid,
+	}
+
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+func TestServiceSelectorLabelsMetric_EmptyLabels(t *testing.T) {
+	metric := newServiceSelectorLabelsMetric("empty-service", "test-ns", "service_selector_labels", []string{}, []string{}, "empty-uid")
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	// Should still have the service metadata labels
+	expectedCount := 3 // service, namespace, uid
+	if len(dtoMetric.Label) != expectedCount {
+		t.Errorf("Expected %d labels, got %d", expectedCount, len(dtoMetric.Label))
+	}
+}
+
+// FakeServiceCache implements ClusterCache interface for testing
+type FakeServiceCache struct {
+	clustercache.ClusterCache
+	services []*clustercache.Service
+}
+
+func (f FakeServiceCache) GetAllServices() []*clustercache.Service {
+	return f.services
+}
+
+func NewFakeServiceCache(services []*clustercache.Service) FakeServiceCache {
+	return FakeServiceCache{
+		services: services,
+	}
+}

+ 15 - 5
pkg/metrics/statefulsetmetrics.go

@@ -40,11 +40,14 @@ func (sc KubecostStatefulsetCollector) Collect(ch chan<- prometheus.Metric) {
 	for _, statefulset := range ds {
 		statefulsetName := statefulset.Name
 		statefulsetNS := statefulset.Namespace
+		statefulsetUID := string(statefulset.UID)
 
-		labels, values := promutil.KubeLabelsToLabels(promutil.SanitizeLabels(statefulset.SpecSelector.MatchLabels))
-		if len(labels) > 0 {
-			m := newStatefulsetMatchLabelsMetric(statefulsetName, statefulsetNS, "statefulSet_match_labels", labels, values)
-			ch <- m
+		if statefulset.SpecSelector != nil {
+			labels, values := promutil.KubeLabelsToLabels(promutil.SanitizeLabels(statefulset.SpecSelector.MatchLabels))
+			if len(labels) > 0 {
+				m := newStatefulsetMatchLabelsMetric(statefulsetName, statefulsetNS, "statefulSet_match_labels", labels, values, statefulsetUID)
+				ch <- m
+			}
 		}
 	}
 
@@ -62,10 +65,11 @@ type StatefulsetMatchLabelsMetric struct {
 	labelValues     []string
 	statefulsetName string
 	namespace       string
+	uid             string
 }
 
 // Creates a new StatefulsetMetric, implementation of prometheus.Metric
-func newStatefulsetMatchLabelsMetric(name, namespace, fqname string, labelNames, labelvalues []string) StatefulsetMatchLabelsMetric {
+func newStatefulsetMatchLabelsMetric(name, namespace, fqname string, labelNames, labelvalues []string, uid string) StatefulsetMatchLabelsMetric {
 	return StatefulsetMatchLabelsMetric{
 		fqName:          fqname,
 		labelNames:      labelNames,
@@ -73,6 +77,7 @@ func newStatefulsetMatchLabelsMetric(name, namespace, fqname string, labelNames,
 		help:            "statefulSet_match_labels StatefulSet Match Labels",
 		statefulsetName: name,
 		namespace:       namespace,
+		uid:             uid,
 	}
 }
 
@@ -82,6 +87,7 @@ func (s StatefulsetMatchLabelsMetric) Desc() *prometheus.Desc {
 	l := prometheus.Labels{
 		"statefulSet": s.statefulsetName,
 		"namespace":   s.namespace,
+		"uid":         s.uid,
 	}
 	return prometheus.NewDesc(s.fqName, s.help, s.labelNames, l)
 }
@@ -108,6 +114,10 @@ func (s StatefulsetMatchLabelsMetric) Write(m *dto.Metric) error {
 		Name:  toStringPtr("statefulSet"),
 		Value: &s.statefulsetName,
 	})
+	labels = append(labels, &dto.LabelPair{
+		Name:  toStringPtr("uid"),
+		Value: &s.uid,
+	})
 	m.Label = labels
 	return nil
 }

+ 302 - 0
pkg/metrics/statefulsetmetrics_test.go

@@ -0,0 +1,302 @@
+package metrics
+
+import (
+	"testing"
+
+	"github.com/opencost/opencost/core/pkg/clustercache"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+// TestKubecostStatefulsetCollector_Describe verifies that Describe emits a
+// metric description only when statefulSet_match_labels is not present in
+// the config's DisabledMetrics list.
+func TestKubecostStatefulsetCollector_Describe(t *testing.T) {
+	tests := []struct {
+		name            string
+		disabledMetrics []string
+		expectMetric    bool
+	}{
+		{
+			name:            "statefulSet_match_labels enabled",
+			disabledMetrics: []string{},
+			expectMetric:    true,
+		},
+		{
+			name:            "statefulSet_match_labels disabled",
+			disabledMetrics: []string{"statefulSet_match_labels"},
+			expectMetric:    false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			sc := KubecostStatefulsetCollector{
+				KubeClusterCache: NewFakeStatefulsetCache([]*clustercache.StatefulSet{}),
+				metricsConfig:    mc,
+			}
+
+			// Buffered channel keeps Describe from blocking; 10 is ample
+			// for the single metric family exercised here.
+			ch := make(chan *prometheus.Desc, 10)
+			sc.Describe(ch)
+			close(ch)
+
+			// Drain the closed channel to count emitted descriptions.
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if tt.expectMetric && count == 0 {
+				t.Error("Expected metric description but got none")
+			}
+			if !tt.expectMetric && count > 0 {
+				t.Error("Expected no metric description but got some")
+			}
+		})
+	}
+}
+
+// TestKubecostStatefulsetCollector_Collect exercises Collect table-style:
+// statefulsets with populated MatchLabels emit one metric each, while empty
+// MatchLabels, a nil SpecSelector, or a disabled metric emit none.
+func TestKubecostStatefulsetCollector_Collect(t *testing.T) {
+	tests := []struct {
+		name            string
+		statefulsets    []*clustercache.StatefulSet
+		disabledMetrics []string
+		expectedCount   int
+	}{
+		{
+			name: "single statefulset with match labels",
+			statefulsets: []*clustercache.StatefulSet{
+				{
+					UID:       types.UID("test-uid-1"),
+					Name:      "test-statefulset",
+					Namespace: "default",
+					SpecSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{"app": "test", "version": "v1"},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   1,
+		},
+		{
+			// No metric expected: the collector skips selectors whose
+			// MatchLabels map is empty.
+			name: "statefulset without match labels",
+			statefulsets: []*clustercache.StatefulSet{
+				{
+					UID:       types.UID("test-uid-2"),
+					Name:      "empty-statefulset",
+					Namespace: "default",
+					SpecSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   0,
+		},
+		{
+			// Regression case for the nil-selector guard added in Collect.
+			name: "statefulset with nil selector",
+			statefulsets: []*clustercache.StatefulSet{
+				{
+					UID:          types.UID("test-uid-3"),
+					Name:         "nil-selector-statefulset",
+					Namespace:    "default",
+					SpecSelector: nil,
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   0,
+		},
+		{
+			name: "multiple statefulsets with match labels",
+			statefulsets: []*clustercache.StatefulSet{
+				{
+					UID:       types.UID("test-uid-4"),
+					Name:      "statefulset1",
+					Namespace: "ns1",
+					SpecSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{"app": "app1"},
+					},
+				},
+				{
+					UID:       types.UID("test-uid-5"),
+					Name:      "statefulset2",
+					Namespace: "ns2",
+					SpecSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{"component": "database"},
+					},
+				},
+			},
+			disabledMetrics: []string{},
+			expectedCount:   2,
+		},
+		{
+			name: "metric disabled",
+			statefulsets: []*clustercache.StatefulSet{
+				{
+					UID:       types.UID("test-uid-6"),
+					Name:      "test-statefulset",
+					Namespace: "default",
+					SpecSelector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{"app": "test"},
+					},
+				},
+			},
+			disabledMetrics: []string{"statefulSet_match_labels"},
+			expectedCount:   0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mc := MetricsConfig{
+				DisabledMetrics: tt.disabledMetrics,
+			}
+			sc := KubecostStatefulsetCollector{
+				KubeClusterCache: NewFakeStatefulsetCache(tt.statefulsets),
+				metricsConfig:    mc,
+			}
+
+			// Buffered channel keeps Collect from blocking; 10 exceeds the
+			// largest expectedCount in this table.
+			ch := make(chan prometheus.Metric, 10)
+			sc.Collect(ch)
+			close(ch)
+
+			// Drain the closed channel to count emitted metrics.
+			count := 0
+			for range ch {
+				count++
+			}
+
+			if count != tt.expectedCount {
+				t.Errorf("Expected %d metrics, got %d", tt.expectedCount, count)
+			}
+		})
+	}
+}
+
+// TestStatefulsetMatchLabelsMetric checks Desc and Write end to end: the
+// written gauge must have value 1.0 and carry the selector labels plus the
+// statefulSet, namespace, and uid metadata labels.
+func TestStatefulsetMatchLabelsMetric(t *testing.T) {
+	labelNames := []string{"app", "version"}
+	labelValues := []string{"test-app", "v1.0"}
+	uid := "test-uid"
+
+	metric := newStatefulsetMatchLabelsMetric("test-statefulset", "default", "statefulSet_match_labels", labelNames, labelValues, uid)
+
+	// Test Desc method
+	desc := metric.Desc()
+	if desc == nil {
+		t.Error("Expected non-nil descriptor")
+	}
+
+	// Test Write method
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	// NOTE(review): if Gauge were nil, t.Error would not stop the test and
+	// the dereference just below would panic; t.Fatal would be safer here.
+	if dtoMetric.Gauge == nil {
+		t.Error("Expected gauge metric")
+	}
+
+	if *dtoMetric.Gauge.Value != 1.0 {
+		t.Errorf("Expected gauge value 1.0, got %f", *dtoMetric.Gauge.Value)
+	}
+
+	// Verify labels
+	expectedLabels := map[string]string{
+		"app":         "test-app",
+		"version":     "v1.0",
+		"statefulSet": "test-statefulset",
+		"namespace":   "default",
+		"uid":         uid,
+	}
+
+	// Flatten the dto label pairs into a map for order-independent comparison.
+	actualLabels := make(map[string]string)
+	for _, label := range dtoMetric.Label {
+		actualLabels[*label.Name] = *label.Value
+	}
+
+	for key, expectedValue := range expectedLabels {
+		if actualValue, ok := actualLabels[key]; !ok {
+			t.Errorf("Missing label %s", key)
+		} else if actualValue != expectedValue {
+			t.Errorf("Label %s: expected %s, got %s", key, expectedValue, actualValue)
+		}
+	}
+}
+
+// TestStatefulsetMatchLabelsMetric_EmptyLabels verifies that with no selector
+// labels at all, Write still attaches the three fixed metadata labels.
+func TestStatefulsetMatchLabelsMetric_EmptyLabels(t *testing.T) {
+	metric := newStatefulsetMatchLabelsMetric("empty-statefulset", "test-ns", "statefulSet_match_labels", []string{}, []string{}, "empty-uid")
+
+	var dtoMetric dto.Metric
+	err := metric.Write(&dtoMetric)
+	if err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+
+	// Should still have the statefulset metadata labels
+	expectedCount := 3 // statefulSet, namespace, uid
+	if len(dtoMetric.Label) != expectedCount {
+		t.Errorf("Expected %d labels, got %d", expectedCount, len(dtoMetric.Label))
+	}
+}
+
+// TestStatefulsetMatchLabelsMetric_MissingFields verifies Write tolerates an
+// empty name, namespace, or uid: the metric is still produced with all three
+// metadata labels present (carrying empty values).
+func TestStatefulsetMatchLabelsMetric_MissingFields(t *testing.T) {
+	tests := []struct {
+		name            string
+		statefulsetName string
+		namespace       string
+		uid             string
+	}{
+		{
+			name:            "empty statefulset name",
+			statefulsetName: "",
+			namespace:       "test-ns",
+			uid:             "test-uid",
+		},
+		{
+			name:            "empty namespace",
+			statefulsetName: "test-statefulset",
+			namespace:       "",
+			uid:             "test-uid",
+		},
+		{
+			name:            "empty uid",
+			statefulsetName: "test-statefulset",
+			namespace:       "test-ns",
+			uid:             "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			metric := newStatefulsetMatchLabelsMetric(tt.statefulsetName, tt.namespace, "statefulSet_match_labels", []string{}, []string{}, tt.uid)
+
+			var dtoMetric dto.Metric
+			err := metric.Write(&dtoMetric)
+			if err != nil {
+				t.Errorf("Expected no error, got %v", err)
+			}
+
+			// Should still create the metric with empty values
+			// (3 labels: statefulSet, namespace, uid).
+			if len(dtoMetric.Label) != 3 {
+				t.Errorf("Expected 3 labels, got %d", len(dtoMetric.Label))
+			}
+		})
+	}
+}
+
+// FakeStatefulsetCache implements ClusterCache interface for testing
+type FakeStatefulsetCache struct {
+	// Embedding the interface satisfies ClusterCache while only
+	// GetAllStatefulSets is actually implemented; calling any other method
+	// panics because the embedded interface value is nil.
+	clustercache.ClusterCache
+	// statefulsets is the canned result returned by GetAllStatefulSets.
+	statefulsets []*clustercache.StatefulSet
+}
+
+// GetAllStatefulSets returns the fixed statefulset list supplied at construction.
+func (f FakeStatefulsetCache) GetAllStatefulSets() []*clustercache.StatefulSet {
+	return f.statefulsets
+}
+
+// NewFakeStatefulsetCache builds a FakeStatefulsetCache seeded with the given statefulsets.
+func NewFakeStatefulsetCache(statefulsets []*clustercache.StatefulSet) FakeStatefulsetCache {
+	return FakeStatefulsetCache{
+		statefulsets: statefulsets,
+	}
+}