فهرست منبع

V2.3 patch/local disk updates (#2865)

* Fix local storage queries

Signed-off-by: Sean Holcomb <seanholcomb@gmail.com>

* Create matchable azure local disk provider IDs (#2842)

Signed-off-by: Sean Holcomb <seanholcomb@gmail.com>

---------

Signed-off-by: Sean Holcomb <seanholcomb@gmail.com>
Sean Holcomb 1 سال پیش
والد
کامیت
7e634c6f5d
3 فایل‌های تغییر یافته به همراه 182 افزوده شده و 65 حذف شده
  1. 32 0
      pkg/cloud/provider/provider.go
  2. 44 0
      pkg/cloud/provider/provider_test.go
  3. 106 65
      pkg/costmodel/cluster.go

+ 32 - 0
pkg/cloud/provider/provider.go

@@ -2,9 +2,11 @@ package provider
 
 import (
 	"errors"
+	"fmt"
 	"net"
 	"net/http"
 	"regexp"
+	"strconv"
 	"strings"
 	"time"
 
@@ -316,6 +318,7 @@ var (
 	// gce://guestbook-227502/us-central1-a/gke-niko-n1-standard-2-wljla-8df8e58a-hfy7
 	//  => gke-niko-n1-standard-2-wljla-8df8e58a-hfy7
 	providerGCERegex = regexp.MustCompile("gce://[^/]*/[^/]*/([^/]+)")
+
 	// Capture "vol-0fc54c5e83b8d2b76" from "aws://us-east-2a/vol-0fc54c5e83b8d2b76"
 	persistentVolumeAWSRegex = regexp.MustCompile("aws:/[^/]*/[^/]*/([^/]+)")
 	// Capture "ad9d88195b52a47c89b5055120f28c58" from "ad9d88195b52a47c89b5055120f28c58-1037804914.us-east-2.elb.amazonaws.com"
@@ -362,3 +365,32 @@ func ParseLBID(id string) string {
 	// Return id for GCP Provider, Azure Provider, CSV Provider and Custom Provider
 	return id
 }
+
+// ParseLocalDiskID attempts to parse a ProviderID from the ProviderID of the node that the local disk is running on
+func ParseLocalDiskID(id string) string {
+	// Parse like node
+	id = ParseID(id)
+
+	if strings.HasPrefix(id, "azure://") {
+
+		// handle vmss ProviderID of type azure:///subscriptions/ae337b64-e7ba-3387-b043-187289efe4e3/resourceGroups/mc_test_eastus2/providers/Microsoft.Compute/virtualMachineScaleSets/aks-userpool-12345678-vmss/virtualMachines/11
+		if strings.Contains(id, "virtualMachineScaleSets") {
+			split := strings.Split(id, "/virtualMachineScaleSets/")
+			// combine vmss name and number into a single string ending in a 6 character base 32 number
+			vmSplit := strings.Split(split[1], "/")
+			if len(vmSplit) != 3 {
+				return id
+			}
+			vmNum, err := strconv.ParseInt(vmSplit[2], 10, 64)
+			if err != nil {
+				return id
+			}
+
+			id = fmt.Sprintf("%s/disks/%s%06s", split[0], vmSplit[0], strconv.FormatInt(vmNum, 32))
+		}
+		id = strings.Replace(id, "/virtualMachines/", "/disks/", -1)
+		id = strings.ToLower(id)
+		return fmt.Sprintf("%s_osdisk", id)
+	}
+	return id
+}

+ 44 - 0
pkg/cloud/provider/provider_test.go

@@ -0,0 +1,44 @@
+package provider
+
+import (
+	"testing"
+)
+
+func TestParseLocalDiskID(t *testing.T) {
+	tests := map[string]struct {
+		input string
+		want  string
+	}{
+		"empty string": {
+			input: "",
+			want:  "",
+		},
+		"generic string": {
+			input: "test",
+			want:  "test",
+		},
+		"AWS node provider id": {
+			input: "aws:///us-east-2a/i-0fea4fd46592d050b",
+			want:  "i-0fea4fd46592d050b",
+		},
+		"GCP node provider id": {
+			input: "gce://guestbook-11111/us-central1-a/gke-niko-n1-standard-2-wlkla-8d48e58a-hfy7",
+			want:  "gke-niko-n1-standard-2-wlkla-8d48e58a-hfy7",
+		},
+		"Azure vmss provider id": {
+			input: "azure:///subscriptions/ae337b64-e7ba-3387-b043-187289efe4e3/resourceGroups/mc_test_eastus2/providers/Microsoft.Compute/virtualMachineScaleSets/aks-userpool-12345678-vmss/virtualMachines/11",
+			want:  "azure:///subscriptions/ae337b64-e7ba-3387-b043-187289efe4e3/resourcegroups/mc_test_eastus2/providers/microsoft.compute/disks/aks-userpool-12345678-vmss00000b_osdisk",
+		},
+		"Azure vm provider id": {
+			input: "azure:///subscriptions/ae337b64-e7ba-3387-b043-187289efe4e3/resourceGroups/mc_test_eastus2/providers/Microsoft.Compute/virtualMachines/master-0",
+			want:  "azure:///subscriptions/ae337b64-e7ba-3387-b043-187289efe4e3/resourcegroups/mc_test_eastus2/providers/microsoft.compute/disks/master-0_osdisk",
+		},
+	}
+	for name, tt := range tests {
+		t.Run(name, func(t *testing.T) {
+			if got := ParseLocalDiskID(tt.input); got != tt.want {
+				t.Errorf("ParseLocalDiskID() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}

+ 106 - 65
pkg/costmodel/cluster.go

@@ -42,7 +42,7 @@ const (
 	queryNodes = `sum(avg(node_total_hourly_cost{%s}) by (node, %s)) * 730 %s`
 )
 
-const maxLocalDiskSize = 200 // AWS limits root disks to 100 Gi, and occasional metric errors in filesystem size should not contribute to large costs.
+const MAX_LOCAL_STORAGE_SIZE = 1024 * 1024 * 1024 * 1024
 
 // Costs represents cumulative and monthly cluster costs over a given duration. Costs
 // are broken down by cores, memory, and storage.
@@ -142,7 +142,7 @@ type DiskIdentifier struct {
 	Name    string
 }
 
-func ClusterDisks(client prometheus.Client, provider models.Provider, start, end time.Time) (map[DiskIdentifier]*Disk, error) {
+func ClusterDisks(client prometheus.Client, cp models.Provider, start, end time.Time) (map[DiskIdentifier]*Disk, error) {
 	// Start from the time "end", querying backwards
 	t := end
 
@@ -277,9 +277,18 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 		diskMap[key].ClaimNamespace = claimNamespace
 	}
 
-	pvCosts(diskMap, resolution, resActiveMins, resPVSize, resPVCost, resPVUsedAvg, resPVUsedMax, resPVCInfo, provider, opencost.NewClosedWindow(start, end))
+	pvCosts(diskMap, resolution, resActiveMins, resPVSize, resPVCost, resPVUsedAvg, resPVUsedMax, resPVCInfo, cp, opencost.NewClosedWindow(start, end))
 
-	for _, result := range resLocalStorageCost {
+	type localStorage struct {
+		device string
+		disk   *Disk
+	}
+
+	localStorageDisks := map[DiskIdentifier]localStorage{}
+
+	// Start with local storage bytes so that the device with the largest size which has passed the
+	// query filters can be determined
+	for _, result := range resLocalStorageBytes {
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		if err != nil {
 			cluster = env.GetClusterID()
@@ -291,23 +300,37 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 			continue
 		}
 
-		cost := result.Values[0].Value
+		device, err := result.GetString("device")
+		if err != nil {
+			log.Warnf("ClusterDisks: local storage data missing device")
+			continue
+		}
+
+		bytes := result.Values[0].Value
+		// Ignore disks that are larger than the max size
+		if bytes > MAX_LOCAL_STORAGE_SIZE {
+			continue
+		}
+
 		key := DiskIdentifier{cluster, name}
-		if _, ok := diskMap[key]; !ok {
-			diskMap[key] = &Disk{
-				Cluster:   cluster,
-				Name:      name,
-				Breakdown: &ClusterCostsBreakdown{},
-				Local:     true,
+
+		// only keep the device with the most bytes per instance
+		if current, ok := localStorageDisks[key]; !ok || current.disk.Bytes < bytes {
+			localStorageDisks[key] = localStorage{
+				device: device,
+				disk: &Disk{
+					Cluster:      cluster,
+					Name:         name,
+					Breakdown:    &ClusterCostsBreakdown{},
+					Local:        true,
+					StorageClass: opencost.LocalStorageClass,
+					Bytes:        bytes,
+				},
 			}
 		}
-		diskMap[key].Cost += cost
-
-		//Assigning explicitly the storage class of local storage to local
-		diskMap[key].StorageClass = opencost.LocalStorageClass
 	}
 
-	for _, result := range resLocalStorageUsedCost {
+	for _, result := range resLocalStorageCost {
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		if err != nil {
 			cluster = env.GetClusterID()
@@ -315,24 +338,27 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 
 		name, err := result.GetString("instance")
 		if err != nil {
-			log.Warnf("ClusterDisks: local storage usage data missing instance")
+			log.Warnf("ClusterDisks: local storage data missing instance")
+			continue
+		}
+
+		device, err := result.GetString("device")
+		if err != nil {
+			log.Warnf("ClusterDisks: local storage data missing device")
 			continue
 		}
 
 		cost := result.Values[0].Value
 		key := DiskIdentifier{cluster, name}
-		if _, ok := diskMap[key]; !ok {
-			diskMap[key] = &Disk{
-				Cluster:   cluster,
-				Name:      name,
-				Breakdown: &ClusterCostsBreakdown{},
-				Local:     true,
-			}
+		ls, ok := localStorageDisks[key]
+		if !ok || ls.device != device {
+			continue
 		}
-		diskMap[key].Breakdown.System = cost / diskMap[key].Cost
+		ls.disk.Cost = cost
+
 	}
 
-	for _, result := range resLocalStorageUsedAvg {
+	for _, result := range resLocalStorageUsedCost {
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		if err != nil {
 			cluster = env.GetClusterID()
@@ -340,24 +366,26 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 
 		name, err := result.GetString("instance")
 		if err != nil {
-			log.Warnf("ClusterDisks: local storage data missing instance")
+			log.Warnf("ClusterDisks: local storage usage data missing instance")
 			continue
 		}
 
-		bytesAvg := result.Values[0].Value
+		device, err := result.GetString("device")
+		if err != nil {
+			log.Warnf("ClusterDisks: local storage data missing device")
+			continue
+		}
+
+		cost := result.Values[0].Value
 		key := DiskIdentifier{cluster, name}
-		if _, ok := diskMap[key]; !ok {
-			diskMap[key] = &Disk{
-				Cluster:   cluster,
-				Name:      name,
-				Breakdown: &ClusterCostsBreakdown{},
-				Local:     true,
-			}
+		ls, ok := localStorageDisks[key]
+		if !ok || ls.device != device {
+			continue
 		}
-		diskMap[key].BytesUsedAvgPtr = &bytesAvg
+		ls.disk.Breakdown.System = cost / ls.disk.Cost
 	}
 
-	for _, result := range resLocalStorageUsedMax {
+	for _, result := range resLocalStorageUsedAvg {
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		if err != nil {
 			cluster = env.GetClusterID()
@@ -369,20 +397,22 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 			continue
 		}
 
-		bytesMax := result.Values[0].Value
+		device, err := result.GetString("device")
+		if err != nil {
+			log.Warnf("ClusterDisks: local storage data missing device")
+			continue
+		}
+
+		bytesAvg := result.Values[0].Value
 		key := DiskIdentifier{cluster, name}
-		if _, ok := diskMap[key]; !ok {
-			diskMap[key] = &Disk{
-				Cluster:   cluster,
-				Name:      name,
-				Breakdown: &ClusterCostsBreakdown{},
-				Local:     true,
-			}
+		ls, ok := localStorageDisks[key]
+		if !ok || ls.device != device {
+			continue
 		}
-		diskMap[key].BytesUsedMaxPtr = &bytesMax
+		ls.disk.BytesUsedAvgPtr = &bytesAvg
 	}
 
-	for _, result := range resLocalStorageBytes {
+	for _, result := range resLocalStorageUsedMax {
 		cluster, err := result.GetString(env.GetPromClusterLabel())
 		if err != nil {
 			cluster = env.GetClusterID()
@@ -394,21 +424,19 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 			continue
 		}
 
-		bytes := result.Values[0].Value
-		key := DiskIdentifier{cluster, name}
-		if _, ok := diskMap[key]; !ok {
-			diskMap[key] = &Disk{
-				Cluster:   cluster,
-				Name:      name,
-				Breakdown: &ClusterCostsBreakdown{},
-				Local:     true,
-			}
+		device, err := result.GetString("device")
+		if err != nil {
+			log.Warnf("ClusterDisks: local storage data missing device")
+			continue
 		}
-		diskMap[key].Bytes = bytes
-		if bytes/1024/1024/1024 > maxLocalDiskSize {
-			log.DedupedWarningf(5, "Deleting large root disk/localstorage disk from analysis")
-			delete(diskMap, key)
+
+		bytesMax := result.Values[0].Value
+		key := DiskIdentifier{cluster, name}
+		ls, ok := localStorageDisks[key]
+		if !ok || ls.device != device {
+			continue
 		}
+		ls.disk.BytesUsedMaxPtr = &bytesMax
 	}
 
 	for _, result := range resLocalActiveMins {
@@ -423,12 +451,20 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 			continue
 		}
 
+		providerID, err := result.GetString("provider_id")
+		if err != nil {
+			log.DedupedWarningf(5, "ClusterDisks: local active mins data missing instance")
+			continue
+		}
+
 		key := DiskIdentifier{cluster, name}
-		if _, ok := diskMap[key]; !ok {
-			log.DedupedWarningf(5, "ClusterDisks: local active mins for unidentified disk or disk deleted from analysis")
+		ls, ok := localStorageDisks[key]
+		if !ok {
 			continue
 		}
 
+		ls.disk.ProviderID = provider.ParseLocalDiskID(providerID)
+
 		if len(result.Values) == 0 {
 			continue
 		}
@@ -439,9 +475,14 @@ func ClusterDisks(client prometheus.Client, provider models.Provider, start, end
 
 		// TODO niko/assets if mins >= threshold, interpolate for missing data?
 
-		diskMap[key].End = e
-		diskMap[key].Start = s
-		diskMap[key].Minutes = mins
+		ls.disk.End = e
+		ls.disk.Start = s
+		ls.disk.Minutes = mins
+	}
+
+	// move local storage disks to main disk map
+	for key, ls := range localStorageDisks {
+		diskMap[key] = ls.disk
 	}
 
 	var unTracedDiskLogData []DiskIdentifier