
Merge branch 'develop' of https://github.com/kubecost/cost-model into develop

Ajay Tripathy 4 years ago
parent commit
e22c67533d

+ 34 - 12
cmd/kubemetrics/main.go

@@ -9,6 +9,7 @@ import (
 
 	"github.com/kubecost/cost-model/pkg/cloud"
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/costmodel"
 	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 	"github.com/kubecost/cost-model/pkg/env"
@@ -27,6 +28,13 @@ import (
 	"k8s.io/klog"
 )
 
+// ClusterExportInterval is the interval used to export the cluster if env.IsExportClusterCacheEnabled() is true
+const ClusterExportInterval = 5 * time.Minute
+
+// clusterExporter is used if env.IsExportClusterCacheEnabled() is set to true;
+// it will export the kubernetes cluster data to a file on a specific interval
+var clusterExporter *clustercache.ClusterExporter
+
 func Healthz(w http.ResponseWriter, _ *http.Request) {
 	w.WriteHeader(200)
 	w.Header().Set("Content-Length", "0")
@@ -34,7 +42,7 @@ func Healthz(w http.ResponseWriter, _ *http.Request) {
 }
 
 // initializes the kubernetes client cache
-func newKubernetesClusterCache() (clustercache.ClusterCache, error) {
+func newKubernetesClusterCache() (kubernetes.Interface, clustercache.ClusterCache, error) {
 	var err error
 
 	// Kubernetes API setup
@@ -46,19 +54,19 @@ func newKubernetesClusterCache() (clustercache.ClusterCache, error) {
 	}
 
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
 	kubeClientset, err := kubernetes.NewForConfig(kc)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
 	// Create Kubernetes Cluster Cache + Watchers
 	k8sCache := clustercache.NewKubernetesClusterCache(kubeClientset)
 	k8sCache.Run()
 
-	return k8sCache, nil
+	return kubeClientset, k8sCache, nil
 }
 
 func newPrometheusClient() (prometheus.Client, error) {
@@ -124,13 +132,19 @@ func main() {
 	klog.Infof("Using scrape interval of %f", scrapeInterval.Seconds())
 
 	// initialize kubernetes client and cluster cache
-	clusterCache, err := newKubernetesClusterCache()
+	k8sClient, clusterCache, err := newKubernetesClusterCache()
 	if err != nil {
 		panic(err.Error())
 	}
 
+	// Create ConfigFileManager for synchronization of shared configuration
+	confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		BucketStoreConfig: env.GetKubecostConfigBucket(),
+		LocalConfigPath:   "/",
+	})
+
 	cloudProviderKey := env.GetCloudProviderAPIKey()
-	cloudProvider, err := cloud.NewProvider(clusterCache, cloudProviderKey)
+	cloudProvider, err := cloud.NewProvider(clusterCache, cloudProviderKey, confManager)
 	if err != nil {
 		panic(err.Error())
 	}
@@ -140,7 +154,6 @@ func main() {
 	watchConfigFunc := configWatchers.ToWatchFunc()
 	watchedConfigs := configWatchers.GetWatchedConfigs()
 
-	k8sClient := clusterCache.GetClient()
 	kubecostNamespace := env.GetKubecostNamespace()
 
 	// We need an initial invocation because the init of the cache has happened before we had access to the provider.
@@ -155,16 +168,25 @@ func main() {
 
 	clusterCache.SetConfigMapUpdateFunc(watchConfigFunc)
 
+	// Initialize cluster exporting if it's enabled
+	if env.IsExportClusterCacheEnabled() {
+		cacheLocation := confManager.ConfigFileAt("/var/configs/cluster-cache.json")
+		clusterExporter = clustercache.NewClusterExporter(clusterCache, cacheLocation, ClusterExportInterval)
+		clusterExporter.Run()
+	}
+
+	// ClusterInfoProvider supplies the cluster map with local and remote cluster data
+	clusterInfoConf := confManager.ConfigFileAt("/var/configs/cluster-info.json")
+	localClusterInfo := costmodel.NewLocalClusterInfoProvider(k8sClient, cloudProvider)
+	clusterInfoProvider := costmodel.NewClusterInfoWriteOnRequest(localClusterInfo, clusterInfoConf)
+
 	// Initialize ClusterMap for maintaining ClusterInfo by ClusterID
-	clusterMap := clusters.NewClusterMap(
-		promCli,
-		costmodel.NewLocalClusterInfoProvider(k8sClient, cloudProvider),
-		5*time.Minute)
+	clusterMap := clusters.NewClusterMap(promCli, clusterInfoProvider, 5*time.Minute)
 
 	costModel := costmodel.NewCostModel(promCli, cloudProvider, clusterCache, clusterMap, scrapeInterval)
 
 	// initialize Kubernetes Metrics Emitter
-	metricsEmitter := costmodel.NewCostModelMetricsEmitter(promCli, clusterCache, cloudProvider, costModel)
+	metricsEmitter := costmodel.NewCostModelMetricsEmitter(promCli, clusterCache, cloudProvider, clusterInfoProvider, costModel)
 
 	// download pricing data
 	err = cloudProvider.DownloadPricingData()

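For reference, the new ConfigFileManager can be driven bucket-backed or purely local; below is a minimal sketch of both modes, assuming only the config APIs introduced in this commit (the bucket config path is hypothetical):

package main

import (
	"github.com/kubecost/cost-model/pkg/config"
)

func newManagers() {
	// Bucket-backed: BucketStoreConfig points at a file whose format is
	// compatible with storage.NewBucketStorage; LocalConfigPath is the
	// local fallback root (see pkg/config/configmanager.go below).
	bucketBacked := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
		BucketStoreConfig: "/var/configs/object-store.yaml", // hypothetical path
		LocalConfigPath:   "/",
	})

	// Local-only: an empty BucketStoreConfig disables bucket storage
	// (ConfigFileManagerOpts.IsBucketStorageEnabled returns false).
	localOnly := config.NewConfigFileManager(config.DefaultConfigFileManagerOpts())

	_ = bucketBacked
	_ = localOnly
}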
+ 3 - 1
go.mod

@@ -15,13 +15,15 @@ require (
 	github.com/aws/aws-sdk-go-v2 v1.9.0
 	github.com/davecgh/go-spew v1.1.1
 	github.com/getsentry/sentry-go v0.6.1
-	github.com/google/uuid v1.1.2
+	github.com/google/uuid v1.3.0
 	github.com/json-iterator/go v1.1.10
 	github.com/jszwec/csvutil v1.2.1
 	github.com/julienschmidt/httprouter v1.3.0
 	github.com/lib/pq v1.2.0
 	github.com/microcosm-cc/bluemonday v1.0.5
+	github.com/minio/minio-go/v7 v7.0.15
 	github.com/patrickmn/go-cache v2.1.0+incompatible
+	github.com/pkg/errors v0.9.1
 	github.com/prometheus/client_golang v1.0.0
 	github.com/prometheus/client_model v0.2.0
 	github.com/rs/cors v1.7.0

+ 27 - 1
go.sum

@@ -111,6 +111,7 @@ github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi
 github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE=
 github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM=
 github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM=
 github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc=
@@ -207,13 +208,15 @@ github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hf
 github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
 github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
 github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
+github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
 github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=
 github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
 github.com/googleapis/gnostic v0.4.1 h1:DLJCy1n/vrD4HPjOvYcT8aYQXpPIzoRZONaYwyycI+I=
 github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg=
+github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
 github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
 github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
 github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
@@ -247,6 +250,7 @@ github.com/jstemmer/go-junit-report v0.9.1 h1:6QPYqodiu3GuPL+7mfx+NwDdp2eTkp9IfE
 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
 github.com/jszwec/csvutil v1.2.1 h1:9+vmGqMdYxIbeDmVbTrVryibx2izwHAfKdPwl4GPNHM=
 github.com/jszwec/csvutil v1.2.1/go.mod h1:8YHz6C3KVdIeCxLMvwbbIVDCTA/Wi2df93AZlQNaE2U=
+github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
 github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
 github.com/juju/errors v0.0.0-20181118221551-089d3ea4e4d5/go.mod h1:W54LbzXuIE0boCoNJfwqpmkKJ1O4TCTZMetAt6jGk7Q=
 github.com/juju/loggo v0.0.0-20180524022052-584905176618/go.mod h1:vgyd7OREkbtVEN/8IXZe5Ooef3LQePvuBm9UWj6ZL8U=
@@ -263,7 +267,12 @@ github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQL
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.9.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
+github.com/klauspost/compress v1.13.5 h1:9O69jUPDcsT9fEm74W92rZL9FQY7rCdaXVneq+yyzl4=
+github.com/klauspost/compress v1.13.5/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
 github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
+github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
+github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s=
+github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
@@ -293,6 +302,12 @@ github.com/mediocregopher/radix/v3 v3.3.0/go.mod h1:EmfVyvspXz1uZEyPBMyGK+kjWiKQ
 github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc=
 github.com/microcosm-cc/bluemonday v1.0.5 h1:cF59UCKMmmUgqN1baLvqU/B1ZsMori+duLVTLpgiG3w=
 github.com/microcosm-cc/bluemonday v1.0.5/go.mod h1:8iwZnFn2CDDNZ0r6UXhF4xawGvzaqzCRa1n3/lO3W2w=
+github.com/minio/md5-simd v1.1.0 h1:QPfiOqlZH+Cj9teu0t9b1nTBfPbyTl16Of5MeuShdK4=
+github.com/minio/md5-simd v1.1.0/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw=
+github.com/minio/minio-go/v7 v7.0.15 h1:r9/NhjJ+nXYrIYvbObhvc1wPj3YH1iDpJzz61uRKLyY=
+github.com/minio/minio-go/v7 v7.0.15/go.mod h1:pUV0Pc+hPd1nccgmzQF/EXh48l/Z/yps6QPF1aaie4g=
+github.com/minio/sha256-simd v0.1.1 h1:5QHSlgo3nt5yKOJrC7W8w7X+NFl8cMPZm96iu8kKUJU=
+github.com/minio/sha256-simd v0.1.1/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM=
 github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
@@ -346,6 +361,8 @@ github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsT
 github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
 github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik=
 github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU=
+github.com/rs/xid v1.2.1 h1:mhH9Nq+C1fY2l1XIpgxIiUOfNpRBYH1kKcr+qfKgjRc=
+github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ=
 github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
 github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
 github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
@@ -355,7 +372,11 @@ github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24 h1:pntxY8Ary0t4
 github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4=
 github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
+github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
+github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
+github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
+github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
 github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
 github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
 github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk=
@@ -409,6 +430,7 @@ golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8U
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
 golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad h1:DN0cp81fZ3njFcrLCytUHRSUkqBjfTo4Tx9RJTWs0EY=
 golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -467,6 +489,7 @@ golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20201110031124-69a78807bb2b h1:uwuIcX0g4Yl1NC5XAz37xsr2lTtcqevgzYNVt49waME=
 golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
@@ -513,6 +536,7 @@ golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200828194041-157a740278f4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201112073958-5cba982894dd h1:5CtCZbICpIOFdgO940moixOPjc0178IU44m4EjOO5IY=
@@ -639,6 +663,8 @@ gopkg.in/go-playground/assert.v1 v1.2.1/go.mod h1:9RXL0bg/zibRAgZUYszZSwO/z8Y/a8
 gopkg.in/go-playground/validator.v8 v8.18.2/go.mod h1:RX2a/7Ha8BgOhfk7j780h4/u/RRjR0eouCJSH80/M2Y=
 gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
 gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
+gopkg.in/ini.v1 v1.57.0 h1:9unxIsFcTt4I55uWluz+UmL95q4kdJ0buvQ1ZIqVQww=
+gopkg.in/ini.v1 v1.57.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
 gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

+ 1 - 1
pkg/cloud/awsprovider.go

@@ -2020,7 +2020,7 @@ func (a *AWS) ExternalAllocations(start string, end string, aggregators []string
 		}
 
 		if customPricing.BillingDataDataset != "" && !crossCluster { // There is GCP data, meaning someone has tried to configure a GCP out-of-cluster allocation.
-			gcp, err := NewCrossClusterProvider("gcp", "aws.json", a.Clientset)
+			gcp, err := NewCrossClusterProvider("gcp", a.Config.ConfigFileManager(), "aws.json", a.Clientset)
 			if err != nil {
 				klog.Infof("Could not instantiate cross-cluster provider %s", err.Error())
 			}

+ 1 - 1
pkg/cloud/gcpprovider.go

@@ -352,7 +352,7 @@ func (gcp *GCP) ExternalAllocations(start string, end string, aggregators []stri
 
 	var s []*OutOfClusterAllocation
 	if c.ServiceKeyName != "" && c.ServiceKeySecret != "" && !crossCluster {
-		aws, err := NewCrossClusterProvider("aws", "gcp.json", gcp.Clientset)
+		aws, err := NewCrossClusterProvider("aws", gcp.Config.ConfigFileManager(), "gcp.json", gcp.Clientset)
 		if err != nil {
 			klog.Infof("Could not instantiate cross-cluster provider %s", err.Error())
 		}

+ 19 - 17
pkg/cloud/provider.go

@@ -4,18 +4,20 @@ import (
 	"database/sql"
 	"errors"
 	"fmt"
-	"github.com/kubecost/cost-model/pkg/util"
 	"io"
 	"regexp"
 	"strconv"
 	"strings"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/util"
+
 	"k8s.io/klog"
 
 	"cloud.google.com/go/compute/metadata"
 
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/watcher"
@@ -383,31 +385,31 @@ func ShareTenancyCosts(p Provider) bool {
 	return config.ShareTenancyCosts == "true"
 }
 
-func NewCrossClusterProvider(ctype string, overrideConfigPath string, cache clustercache.ClusterCache) (Provider, error) {
+func NewCrossClusterProvider(ctype string, config *config.ConfigFileManager, overrideConfigPath string, cache clustercache.ClusterCache) (Provider, error) {
 	if ctype == "aws" {
 		return &AWS{
 			Clientset: cache,
-			Config:    NewProviderConfig(overrideConfigPath),
+			Config:    NewProviderConfig(config, overrideConfigPath),
 		}, nil
 	} else if ctype == "gcp" {
 		return &GCP{
 			Clientset: cache,
-			Config:    NewProviderConfig(overrideConfigPath),
+			Config:    NewProviderConfig(config, overrideConfigPath),
 		}, nil
 	} else if ctype == "azure" {
 		return &Azure{
 			Clientset: cache,
-			Config:    NewProviderConfig(overrideConfigPath),
+			Config:    NewProviderConfig(config, overrideConfigPath),
 		}, nil
 	}
 	return &CustomProvider{
 		Clientset: cache,
-		Config:    NewProviderConfig(overrideConfigPath),
+		Config:    NewProviderConfig(config, overrideConfigPath),
 	}, nil
 }
 
 // NewProvider looks at the nodespec or provider metadata server to decide which provider to instantiate.
-func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, error) {
+func NewProvider(cache clustercache.ClusterCache, apiKey string, config *config.ConfigFileManager) (Provider, error) {
 	nodes := cache.GetAllNodes()
 	if len(nodes) == 0 {
 		return nil, fmt.Errorf("Could not locate any nodes for cluster.")
@@ -422,7 +424,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 			CSVLocation: env.GetCSVPath(),
 			CustomProvider: &CustomProvider{
 				Clientset: cache,
-				Config:    NewProviderConfig(cp.configFileName),
+				Config:    NewProviderConfig(config, cp.configFileName),
 			},
 		}, nil
 	case "GCP":
@@ -433,7 +435,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		return &GCP{
 			Clientset:        cache,
 			APIKey:           apiKey,
-			Config:           NewProviderConfig(cp.configFileName),
+			Config:           NewProviderConfig(config, cp.configFileName),
 			clusterRegion:    cp.region,
 			clusterProjectId: cp.projectID,
 		}, nil
@@ -441,7 +443,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		klog.V(2).Info("Found ProviderID starting with \"aws\", using AWS Provider")
 		return &AWS{
 			Clientset:        cache,
-			Config:           NewProviderConfig(cp.configFileName),
+			Config:           NewProviderConfig(config, cp.configFileName),
 			clusterRegion:    cp.region,
 			clusterAccountId: cp.accountID,
 		}, nil
@@ -449,7 +451,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		klog.V(2).Info("Found ProviderID starting with \"azure\", using Azure Provider")
 		return &Azure{
 			Clientset:        cache,
-			Config:           NewProviderConfig(cp.configFileName),
+			Config:           NewProviderConfig(config, cp.configFileName),
 			clusterRegion:    cp.region,
 			clusterAccountId: cp.accountID,
 		}, nil
@@ -457,7 +459,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		klog.V(2).Info("Unsupported provider, falling back to default")
 		return &CustomProvider{
 			Clientset: cache,
-			Config:    NewProviderConfig(cp.configFileName),
+			Config:    NewProviderConfig(config, cp.configFileName),
 		}, nil
 	}
 }
@@ -470,15 +472,15 @@ type clusterProperties struct {
 	projectID      string
 }
 
-func getClusterProperties(node *v1.Node) (clusterProperties) {
+func getClusterProperties(node *v1.Node) clusterProperties {
 	providerID := strings.ToLower(node.Spec.ProviderID)
 	region, _ := util.GetRegion(node.Labels)
 	cp := clusterProperties{
-		provider: "DEFAULT",
+		provider:       "DEFAULT",
 		configFileName: "default.json",
-		region: region,
-		accountID: "",
-		projectID: "",
+		region:         region,
+		accountID:      "",
+		projectID:      "",
 	}
 	if metadata.OnGCE() {
 		cp.provider = "GCP"

+ 69 - 34
pkg/cloud/providerconfig.go

@@ -2,14 +2,15 @@ package cloud
 
 import (
 	"fmt"
-	"io/ioutil"
+	gopath "path"
 	"reflect"
 	"strconv"
 	"strings"
 	"sync"
 
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/env"
-	"github.com/kubecost/cost-model/pkg/util/fileutil"
+	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/json"
 	"github.com/microcosm-cc/bluemonday"
 
@@ -18,23 +19,61 @@ import (
 
 var sanitizePolicy = bluemonday.UGCPolicy()
 
-// ProviderConfig is a utility class that provides a thread-safe configuration
-// storage/cache for all Provider implementations
+// ProviderConfig is a utility class that provides a thread-safe configuration storage/cache for all Provider
+// implementations
 type ProviderConfig struct {
-	lock          *sync.Mutex
-	fileName      string
-	configPath    string
-	customPricing *CustomPricing
+	lock            *sync.Mutex
+	configManager   *config.ConfigFileManager
+	configFile      *config.ConfigFile
+	customPricing   *CustomPricing
+	watcherHandleID config.HandlerID
 }
 
-// Creates a new ProviderConfig instance
-func NewProviderConfig(file string) *ProviderConfig {
-	return &ProviderConfig{
+// NewProviderConfig creates a new ConfigFile and returns the ProviderConfig
+func NewProviderConfig(configManager *config.ConfigFileManager, fileName string) *ProviderConfig {
+	configFile := configManager.ConfigFileAt(configPathFor(fileName))
+	pc := &ProviderConfig{
 		lock:          new(sync.Mutex),
-		fileName:      file,
-		configPath:    configPathFor(file),
+		configManager: configManager,
+		configFile:    configFile,
 		customPricing: nil,
 	}
+
+	// add the provider config func as handler for the config file changes
+	pc.watcherHandleID = configFile.AddChangeHandler(pc.onConfigFileUpdated)
+	return pc
+}
+
+// onConfigFileUpdated handles any time the config file contents are updated, created, or deleted
+func (pc *ProviderConfig) onConfigFileUpdated(changeType config.ChangeType, data []byte) {
+	// TODO: (bolt) Currently this has the side-effect of setting pc.customPricing twice when the update
+	// TODO: (bolt) is made from this ProviderConfig instance. We'll need to implement a way of identifying
+	// TODO: (bolt) when to ignore updates when the change and handler are the same source
+	log.Infof("CustomPricing Config Updated: %s", changeType)
+
+	switch changeType {
+	case config.ChangeTypeCreated:
+		fallthrough
+	case config.ChangeTypeModified:
+		pc.lock.Lock()
+		defer pc.lock.Unlock()
+
+		customPricing := new(CustomPricing)
+		err := json.Unmarshal(data, customPricing)
+		if err != nil {
+			klog.Infof("Could not decode Custom Pricing file at path %s. Using default.", pc.configFile.Path())
+			customPricing = DefaultPricing()
+		}
+
+		pc.customPricing = customPricing
+		if pc.customPricing.SpotGPU == "" {
+			pc.customPricing.SpotGPU = DefaultPricing().SpotGPU // Migration for users without this value set by default.
+		}
+
+		if pc.customPricing.ShareTenancyCosts == "" {
+			pc.customPricing.ShareTenancyCosts = defaultShareTenancyCost
+		}
+	}
 }
 
 // Non-ThreadSafe logic to load the config file if a cache does not exist. Flag to write
@@ -44,16 +83,16 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 		return pc.customPricing, nil
 	}
 
-	exists, err := fileExists(pc.configPath)
+	exists, err := pc.configFile.Exists()
 	// File Error other than NotExists
 	if err != nil {
-		klog.Infof("Custom Pricing file at path '%s' read error: '%s'", pc.configPath, err.Error())
+		klog.Infof("Custom Pricing file at path '%s' read error: '%s'", pc.configFile.Path(), err.Error())
 		return DefaultPricing(), err
 	}
 
 	// File Doesn't Exist
 	if !exists {
-		klog.Infof("Could not find Custom Pricing file at path '%s'", pc.configPath)
+		klog.Infof("Could not find Custom Pricing file at path '%s'", pc.configFile.Path())
 		pc.customPricing = DefaultPricing()
 
 		// Only write the file if flag enabled
@@ -63,9 +102,9 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 				return pc.customPricing, err
 			}
 
-			err = ioutil.WriteFile(pc.configPath, cj, 0644)
+			err = pc.configFile.Write(cj)
 			if err != nil {
-				klog.Infof("Could not write Custom Pricing file to path '%s'", pc.configPath)
+				klog.Infof("Could not write Custom Pricing file to path '%s'", pc.configFile.Path())
 				return pc.customPricing, err
 			}
 		}
@@ -74,9 +113,9 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 	}
 
 	// File Exists - Read all contents of file, unmarshal json
-	byteValue, err := ioutil.ReadFile(pc.configPath)
+	byteValue, err := pc.configFile.Read()
 	if err != nil {
-		klog.Infof("Could not read Custom Pricing file at path %s", pc.configPath)
+		klog.Infof("Could not read Custom Pricing file at path %s", pc.configFile.Path())
 		// If read fails, we don't want to cache default, assuming that the file is valid
 		return DefaultPricing(), err
 	}
@@ -84,7 +123,7 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 	var customPricing CustomPricing
 	err = json.Unmarshal(byteValue, &customPricing)
 	if err != nil {
-		klog.Infof("Could not decode Custom Pricing file at path %s", pc.configPath)
+		klog.Infof("Could not decode Custom Pricing file at path %s", pc.configFile.Path())
 		return DefaultPricing(), err
 	}
 
@@ -108,6 +147,13 @@ func (pc *ProviderConfig) GetCustomPricingData() (*CustomPricing, error) {
 	return pc.loadConfig(true)
 }
 
+// ConfigFileManager returns the ConfigFileManager instance used to manage the CustomPricing
+// configuration. In the event of a multi-provider setup, this instance should be used to
+// configure any other configuration providers.
+func (pc *ProviderConfig) ConfigFileManager() *config.ConfigFileManager {
+	return pc.configManager
+}
+
 // Allows a call to manually update the configuration while maintaining proper thread-safety
 // for read/write methods.
 func (pc *ProviderConfig) Update(updateFunc func(*CustomPricing) error) (*CustomPricing, error) {
@@ -132,7 +178,7 @@ func (pc *ProviderConfig) Update(updateFunc func(*CustomPricing) error) (*Custom
 	if err != nil {
 		return c, err
 	}
-	err = ioutil.WriteFile(pc.configPath, cj, 0644)
+	err = pc.configFile.Write(cj)
 
 	if err != nil {
 		return c, err
@@ -210,19 +256,8 @@ func SetCustomPricingField(obj *CustomPricing, name string, value string) error
 	return nil
 }
 
-// File exists has three different return cases that should be handled:
-//   1. File exists and is not a directory (true, nil)
-//   2. File does not exist (false, nil)
-//   3. File may or may not exist. Error occurred during stat (false, error)
-// The third case represents the scenario where the stat returns an error,
-// but the error isn't relevant to the path. This can happen when the current
-// user doesn't have permission to access the file.
-func fileExists(filename string) (bool, error) {
-	return fileutil.FileExists(filename) // delegate to utility method
-}
-
 // Returns the configuration directory concatenated with a specific config file name
 func configPathFor(filename string) string {
 	path := env.GetConfigPathWithDefault("/models/")
-	return path + filename
+	return gopath.Join(path, filename)
 }

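The configPathFor change above swaps raw string concatenation for gopath.Join, which tolerates a configured path with or without a trailing separator; a quick illustration:

package main

import (
	"fmt"
	gopath "path"
)

func main() {
	// Plain concatenation silently breaks when the configured path
	// lacks a trailing slash.
	fmt.Println("/models" + "aws.json") // /modelsaws.json (broken)

	// gopath.Join normalizes the separator in both cases.
	fmt.Println(gopath.Join("/models", "aws.json"))  // /models/aws.json
	fmt.Println(gopath.Join("/models/", "aws.json")) // /models/aws.json
}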
+ 20 - 10
pkg/clustercache/clustercache.go

@@ -10,6 +10,7 @@ import (
 	autoscaling "k8s.io/api/autoscaling/v2beta1"
 	batchv1 "k8s.io/api/batch/v1"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/policy/v1beta1"
 	stv1 "k8s.io/api/storage/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/client-go/kubernetes"
@@ -24,10 +25,6 @@ type ClusterCache interface {
 	// Stops the watcher processes
 	Stop()
 
-	// Gets the underlying clientset
-	// TODO: Remove once we support all cached cluster components
-	GetClient() kubernetes.Interface
-
 	// GetAllNamespaces returns all the cached namespaces
 	GetAllNamespaces() []*v1.Namespace
 
@@ -64,9 +61,12 @@ type ClusterCache interface {
 	// GetAllJobs returns all the cached jobs
 	GetAllJobs() []*batchv1.Job
 
-	// GetAllHorizontalPodAutoscalers() returns all cached horizontal pod autoscalers
+	// GetAllHorizontalPodAutoscalers returns all cached horizontal pod autoscalers
 	GetAllHorizontalPodAutoscalers() []*autoscaling.HorizontalPodAutoscaler
 
+	// GetAllPodDisruptionBudgets returns all cached pod disruption budgets
+	GetAllPodDisruptionBudgets() []*v1beta1.PodDisruptionBudget
+
 	// SetConfigMapUpdateFunc sets the configmap update function
 	SetConfigMapUpdateFunc(func(interface{}))
 }
@@ -89,6 +89,7 @@ type KubernetesClusterCache struct {
 	storageClassWatch      WatchController
 	jobsWatch              WatchController
 	hpaWatch               WatchController
+	pdbWatch               WatchController
 	stop                   chan struct{}
 }
 
@@ -103,6 +104,7 @@ func NewKubernetesClusterCache(client kubernetes.Interface) ClusterCache {
 	storageRestClient := client.StorageV1().RESTClient()
 	batchClient := client.BatchV1().RESTClient()
 	autoscalingClient := client.AutoscalingV2beta1().RESTClient()
+	pdbClient := client.PolicyV1beta1().RESTClient()
 
 	kubecostNamespace := env.GetKubecostNamespace()
 	klog.Infof("NAMESPACE: %s", kubecostNamespace)
@@ -123,11 +125,12 @@ func NewKubernetesClusterCache(client kubernetes.Interface) ClusterCache {
 		storageClassWatch:      NewCachingWatcher(storageRestClient, "storageclasses", &stv1.StorageClass{}, "", fields.Everything()),
 		jobsWatch:              NewCachingWatcher(batchClient, "jobs", &batchv1.Job{}, "", fields.Everything()),
 		hpaWatch:               NewCachingWatcher(autoscalingClient, "horizontalpodautoscalers", &autoscaling.HorizontalPodAutoscaler{}, "", fields.Everything()),
+		pdbWatch:               NewCachingWatcher(pdbClient, "poddisruptionbudgets", &v1beta1.PodDisruptionBudget{}, "", fields.Everything()),
 	}
 
 	// Wait for each caching watcher to initialize
 	var wg sync.WaitGroup
-	wg.Add(14)
+	wg.Add(15)
 
 	cancel := make(chan struct{})
 
@@ -145,6 +148,7 @@ func NewKubernetesClusterCache(client kubernetes.Interface) ClusterCache {
 	go initializeCache(kcc.storageClassWatch, &wg, cancel)
 	go initializeCache(kcc.jobsWatch, &wg, cancel)
 	go initializeCache(kcc.hpaWatch, &wg, cancel)
+	go initializeCache(kcc.pdbWatch, &wg, cancel)
 
 	wg.Wait()
 
@@ -171,6 +175,7 @@ func (kcc *KubernetesClusterCache) Run() {
 	go kcc.storageClassWatch.Run(1, stopCh)
 	go kcc.jobsWatch.Run(1, stopCh)
 	go kcc.hpaWatch.Run(1, stopCh)
+	go kcc.pdbWatch.Run(1, stopCh)
 
 	kcc.stop = stopCh
 }
@@ -184,10 +189,6 @@ func (kcc *KubernetesClusterCache) Stop() {
 	kcc.stop = nil
 }
 
-func (kcc *KubernetesClusterCache) GetClient() kubernetes.Interface {
-	return kcc.client
-}
-
 func (kcc *KubernetesClusterCache) GetAllNamespaces() []*v1.Namespace {
 	var namespaces []*v1.Namespace
 	items := kcc.namespaceWatch.GetAll()
@@ -305,6 +306,15 @@ func (kcc *KubernetesClusterCache) GetAllHorizontalPodAutoscalers() []*autoscali
 	return hpas
 }
 
+func (kcc *KubernetesClusterCache) GetAllPodDisruptionBudgets() []*v1beta1.PodDisruptionBudget {
+	var pdbs []*v1beta1.PodDisruptionBudget
+	items := kcc.pdbWatch.GetAll()
+	for _, pdb := range items {
+		pdbs = append(pdbs, pdb.(*v1beta1.PodDisruptionBudget))
+	}
+	return pdbs
+}
+
 func (kcc *KubernetesClusterCache) SetConfigMapUpdateFunc(f func(interface{})) {
 	kcc.kubecostConfigMapWatch.SetUpdateHandler(f)
 }

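With pod disruption budgets now cached, callers can enumerate them like any other watched resource; a small sketch (the helper below is illustrative, not part of the commit):

package main

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/clustercache"
)

// printPDBs is a hypothetical helper exercising the new accessor.
func printPDBs(cache clustercache.ClusterCache) {
	for _, pdb := range cache.GetAllPodDisruptionBudgets() {
		fmt.Printf("%s/%s minAvailable=%v\n", pdb.Namespace, pdb.Name, pdb.Spec.MinAvailable)
	}
}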
+ 114 - 0
pkg/clustercache/clusterexporter.go

@@ -0,0 +1,114 @@
+package clustercache
+
+import (
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/config"
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/util/atomic"
+	"github.com/kubecost/cost-model/pkg/util/json"
+
+	appsv1 "k8s.io/api/apps/v1"
+	autoscaling "k8s.io/api/autoscaling/v2beta1"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/policy/v1beta1"
+	stv1 "k8s.io/api/storage/v1"
+)
+
+// clusterEncoding is used to represent the cluster objects in an encoded state.
+type clusterEncoding struct {
+	Namespaces               []*v1.Namespace                        `json:"namespaces,omitempty"`
+	Nodes                    []*v1.Node                             `json:"nodes,omitempty"`
+	Pods                     []*v1.Pod                              `json:"pods,omitempty"`
+	Services                 []*v1.Service                          `json:"services,omitempty"`
+	DaemonSets               []*appsv1.DaemonSet                    `json:"daemonSets,omitempty"`
+	Deployments              []*appsv1.Deployment                   `json:"deployments,omitempty"`
+	StatefulSets             []*appsv1.StatefulSet                  `json:"statefulSets,omitempty"`
+	ReplicaSets              []*appsv1.ReplicaSet                   `json:"replicaSets,omitempty"`
+	PersistentVolumes        []*v1.PersistentVolume                 `json:"persistentVolumes,omitempty"`
+	PersistentVolumeClaims   []*v1.PersistentVolumeClaim            `json:"persistentVolumeClaims,omitempty"`
+	StorageClasses           []*stv1.StorageClass                   `json:"storageClasses,omitempty"`
+	Jobs                     []*batchv1.Job                         `json:"jobs,omitempty"`
+	HorizontalPodAutoscalers []*autoscaling.HorizontalPodAutoscaler `json:"horizontalPodAutoscalers,omitempty"`
+	PodDisruptionBudgets     []*v1beta1.PodDisruptionBudget         `json:"podDisruptionBudgets,omitempty"`
+}
+
+// ClusterExporter manages and runs a file export process which dumps the local kubernetes cluster to a target location.
+type ClusterExporter struct {
+	cluster  ClusterCache
+	target   *config.ConfigFile
+	interval time.Duration
+	runState atomic.AtomicRunState
+}
+
+// NewClusterExporter creates a new ClusterExporter instance for exporting the kubernetes cluster.
+func NewClusterExporter(cluster ClusterCache, target *config.ConfigFile, interval time.Duration) *ClusterExporter {
+	return &ClusterExporter{
+		cluster:  cluster,
+		target:   target,
+		interval: interval,
+	}
+}
+
+// Run starts the automated process of running Export on a specific interval.
+func (ce *ClusterExporter) Run() {
+	// in the event there is a race that occurs between Run() and Stop(), we
+	// ensure that we wait for the reset to occur before starting again
+	ce.runState.WaitForReset()
+
+	if !ce.runState.Start() {
+		log.Warningf("ClusterExporter already running")
+		return
+	}
+
+	go func() {
+		for {
+			err := ce.Export()
+			if err != nil {
+				log.Warningf("Failed to export cluster: %s", err)
+			}
+
+			select {
+			case <-time.After(ce.interval):
+			case <-ce.runState.OnStop():
+				ce.runState.Reset()
+				return
+			}
+		}
+	}()
+}
+
+// Stop halts the Cluster export on an interval
+func (ce *ClusterExporter) Stop() {
+	ce.runState.Stop()
+}
+
+// Export stores the cluster cache data into a PODO, marshals as JSON, and saves it to the
+// target location.
+func (ce *ClusterExporter) Export() error {
+	c := ce.cluster
+	encoding := &clusterEncoding{
+		Namespaces:               c.GetAllNamespaces(),
+		Nodes:                    c.GetAllNodes(),
+		Pods:                     c.GetAllPods(),
+		Services:                 c.GetAllServices(),
+		DaemonSets:               c.GetAllDaemonSets(),
+		Deployments:              c.GetAllDeployments(),
+		StatefulSets:             c.GetAllStatefulSets(),
+		ReplicaSets:              c.GetAllReplicaSets(),
+		PersistentVolumes:        c.GetAllPersistentVolumes(),
+		PersistentVolumeClaims:   c.GetAllPersistentVolumeClaims(),
+		StorageClasses:           c.GetAllStorageClasses(),
+		Jobs:                     c.GetAllJobs(),
+		HorizontalPodAutoscalers: c.GetAllHorizontalPodAutoscalers(),
+		PodDisruptionBudgets:     c.GetAllPodDisruptionBudgets(),
+	}
+
+	data, err := json.Marshal(encoding)
+	if err != nil {
+		return err
+	}
+
+	return ce.target.Write(data)
+}

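Outside of main.go, wiring the exporter needs only a cache, a target ConfigFile, and an interval; a hedged sketch (the helper name is illustrative, and the target path mirrors the main.go change above):

package main

import (
	"time"

	"github.com/kubecost/cost-model/pkg/clustercache"
	"github.com/kubecost/cost-model/pkg/config"
)

// startExporter is a hypothetical helper mirroring the kubemetrics wiring.
func startExporter(cache clustercache.ClusterCache, confManager *config.ConfigFileManager) *clustercache.ClusterExporter {
	// Resolving the target through the ConfigFileManager lets the snapshot
	// land in bucket storage when that is enabled.
	target := confManager.ConfigFileAt("/var/configs/cluster-cache.json")

	exporter := clustercache.NewClusterExporter(cache, target, 5*time.Minute)
	exporter.Run() // exports once immediately, then again on every interval tick
	return exporter
}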
+ 309 - 0
pkg/clustercache/clusterimporter.go

@@ -0,0 +1,309 @@
+package clustercache
+
+import (
+	"sync"
+
+	"github.com/kubecost/cost-model/pkg/config"
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/util/json"
+	appsv1 "k8s.io/api/apps/v1"
+	autoscaling "k8s.io/api/autoscaling/v2beta1"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/policy/v1beta1"
+	stv1 "k8s.io/api/storage/v1"
+)
+
+// ClusterImporter is an implementation of ClusterCache which leverages a backing configuration file
+// as its source of the cluster data.
+type ClusterImporter struct {
+	source          *config.ConfigFile
+	sourceHandlerID config.HandlerID
+	dataLock        *sync.Mutex
+	data            *clusterEncoding
+}
+
+// NewClusterImporter creates a new ClusterCache implementation which uses an import process to provide cluster data
+func NewClusterImporter(source *config.ConfigFile) ClusterCache {
+	return &ClusterImporter{
+		source:   source,
+		dataLock: new(sync.Mutex),
+		data:     new(clusterEncoding),
+	}
+}
+
+// onImportSourceChanged handles the source data updating
+func (ci *ClusterImporter) onImportSourceChanged(changeType config.ChangeType, data []byte) {
+	if changeType == config.ChangeTypeDeleted {
+		ci.dataLock.Lock()
+		ci.data = new(clusterEncoding)
+		ci.dataLock.Unlock()
+		return
+	}
+
+	ci.update(data)
+}
+
+// update replaces the underlying cluster data with the provided new data if it decodes
+func (ci *ClusterImporter) update(data []byte) {
+	ce := new(clusterEncoding)
+	err := json.Unmarshal(data, ce)
+	if err != nil {
+		log.Warningf("Failed to unmarshal cluster during import: %s", err)
+		return
+	}
+
+	ci.dataLock.Lock()
+	ci.data = ce
+	ci.dataLock.Unlock()
+}
+
+// Run starts the watcher processes
+func (ci *ClusterImporter) Run() {
+	if ci.source == nil {
+		log.Errorf("ClusterImporter source does not exist, not running")
+		return
+	}
+
+	exists, err := ci.source.Exists()
+	if err != nil {
+		log.Errorf("Failed to import source for cluster: %s", err)
+		return
+	}
+
+	if exists {
+		data, err := ci.source.Read()
+		if err != nil {
+			log.Warningf("Failed to import cluster: %s", err)
+		} else {
+			ci.update(data)
+		}
+	}
+
+	ci.sourceHandlerID = ci.source.AddChangeHandler(ci.onImportSourceChanged)
+}
+
+// Stop stops the watcher processes
+func (ci *ClusterImporter) Stop() {
+	if ci.sourceHandlerID != "" {
+		ci.source.RemoveChangeHandler(ci.sourceHandlerID)
+		ci.sourceHandlerID = ""
+	}
+}
+
+// GetAllNamespaces returns all the cached namespaces
+func (ci *ClusterImporter) GetAllNamespaces() []*v1.Namespace {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	namespaces := ci.data.Namespaces
+	cloneList := make([]*v1.Namespace, 0, len(namespaces))
+	for _, v := range namespaces {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllNodes returns all the cached nodes
+func (ci *ClusterImporter) GetAllNodes() []*v1.Node {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	nodes := ci.data.Nodes
+	cloneList := make([]*v1.Node, 0, len(nodes))
+	for _, v := range nodes {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPods returns all the cached pods
+func (ci *ClusterImporter) GetAllPods() []*v1.Pod {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pods := ci.data.Pods
+	cloneList := make([]*v1.Pod, 0, len(pods))
+	for _, v := range pods {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllServices returns all the cached services
+func (ci *ClusterImporter) GetAllServices() []*v1.Service {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	services := ci.data.Services
+	cloneList := make([]*v1.Service, 0, len(services))
+	for _, v := range services {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllDaemonSets returns all the cached DaemonSets
+func (ci *ClusterImporter) GetAllDaemonSets() []*appsv1.DaemonSet {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	daemonSets := ci.data.DaemonSets
+	cloneList := make([]*appsv1.DaemonSet, 0, len(daemonSets))
+	for _, v := range daemonSets {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllDeployments returns all the cached deployments
+func (ci *ClusterImporter) GetAllDeployments() []*appsv1.Deployment {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	deployments := ci.data.Deployments
+	cloneList := make([]*appsv1.Deployment, 0, len(deployments))
+	for _, v := range deployments {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllStatefulSets returns all the cached StatefulSets
+func (ci *ClusterImporter) GetAllStatefulSets() []*appsv1.StatefulSet {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	statefulSets := ci.data.StatefulSets
+	cloneList := make([]*appsv1.StatefulSet, 0, len(statefulSets))
+	for _, v := range statefulSets {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllReplicaSets returns all the cached ReplicaSets
+func (ci *ClusterImporter) GetAllReplicaSets() []*appsv1.ReplicaSet {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	replicaSets := ci.data.ReplicaSets
+	cloneList := make([]*appsv1.ReplicaSet, 0, len(replicaSets))
+	for _, v := range replicaSets {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPersistentVolumes returns all the cached persistent volumes
+func (ci *ClusterImporter) GetAllPersistentVolumes() []*v1.PersistentVolume {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pvs := ci.data.PersistentVolumes
+	cloneList := make([]*v1.PersistentVolume, 0, len(pvs))
+	for _, v := range pvs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPersistentVolumeClaims returns all the cached persistent volume claims
+func (ci *ClusterImporter) GetAllPersistentVolumeClaims() []*v1.PersistentVolumeClaim {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pvcs := ci.data.PersistentVolumeClaims
+	cloneList := make([]*v1.PersistentVolumeClaim, 0, len(pvcs))
+	for _, v := range pvcs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllStorageClasses returns all the cached storage classes
+func (ci *ClusterImporter) GetAllStorageClasses() []*stv1.StorageClass {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	storageClasses := ci.data.StorageClasses
+	cloneList := make([]*stv1.StorageClass, 0, len(storageClasses))
+	for _, v := range storageClasses {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllJobs returns all the cached jobs
+func (ci *ClusterImporter) GetAllJobs() []*batchv1.Job {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	jobs := ci.data.Jobs
+	cloneList := make([]*batchv1.Job, 0, len(jobs))
+	for _, v := range jobs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllHorizontalPodAutoscalers returns all cached horizontal pod autoscalers
+func (ci *ClusterImporter) GetAllHorizontalPodAutoscalers() []*autoscaling.HorizontalPodAutoscaler {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	hpas := ci.data.HorizontalPodAutoscalers
+	cloneList := make([]*autoscaling.HorizontalPodAutoscaler, 0, len(hpas))
+	for _, v := range hpas {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPodDisruptionBudgets returns all cached pod disruption budgets
+func (ci *ClusterImporter) GetAllPodDisruptionBudgets() []*v1beta1.PodDisruptionBudget {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to prevent callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pdbs := ci.data.PodDisruptionBudgets
+	cloneList := make([]*v1beta1.PodDisruptionBudget, 0, len(pdbs))
+	for _, v := range pdbs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// SetConfigMapUpdateFunc sets the configmap update function
+func (ci *ClusterImporter) SetConfigMapUpdateFunc(_ func(interface{})) {
+	// TODO: (bolt) This function is still a bit strange to me for the ClusterCache interface.
+	// TODO: (bolt) no-op for now.
+	log.Warningf("SetConfigMapUpdateFunc is disabled for imported cluster data.")
+}

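On the consuming side, a ClusterImporter can stand in anywhere a ClusterCache is expected; a minimal sketch, assuming the same snapshot path used by the exporter above:

package main

import (
	"github.com/kubecost/cost-model/pkg/clustercache"
	"github.com/kubecost/cost-model/pkg/config"
)

// importedCluster is a hypothetical helper; NewClusterImporter returns the
// ClusterCache interface, so callers never touch the concrete type.
func importedCluster(confManager *config.ConfigFileManager) clustercache.ClusterCache {
	source := confManager.ConfigFileAt("/var/configs/cluster-cache.json")

	cache := clustercache.NewClusterImporter(source)
	cache.Run() // loads the snapshot if present, then tracks changes via a file watcher
	return cache
}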
+ 364 - 0
pkg/config/configfile.go

@@ -0,0 +1,364 @@
+package config
+
+import (
+	"errors"
+	"os"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/storage"
+	"github.com/kubecost/cost-model/pkg/util/atomic"
+)
+
+// HandlerID is a unique identifier assigned to a provided ConfigChangedHandler. This is used to remove a handler
+// from the ConfigFile when it is no longer needed.
+type HandlerID string
+
+//--------------------------------------------------------------------------
+//  ChangeType
+//--------------------------------------------------------------------------
+
+// ChangeType is used to specifically categorize the change that was made on a ConfigFile
+type ChangeType string
+
+// ChangeType constants contain the different types of updates passed through the ConfigChangedHandler
+const (
+	ChangeTypeCreated  ChangeType = "created"
+	ChangeTypeModified ChangeType = "modified"
+	ChangeTypeDeleted  ChangeType = "deleted"
+)
+
+//--------------------------------------------------------------------------
+//  ConfigChangedHandler
+//--------------------------------------------------------------------------
+
+// ConfigChangedHandler is the func handler used to receive change updates about the
+// config file. Both ChangeTypeCreated and ChangeTypeModified yield a valid []byte, while
+// ChangeTypeDeleted yields a nil []byte.
+type ConfigChangedHandler func(ChangeType, []byte)
+
+//--------------------------------------------------------------------------
+//  ConfigFile
+//--------------------------------------------------------------------------
+
+// DefaultHandlerPriority is used as the priority for any handlers added via AddChangeHandler
+const DefaultHandlerPriority int = 1000
+
+// NoBackingStore error is used when the config file's backing storage is missing
+var NoBackingStore error = errors.New("Backing storage does not exist.")
+
+// ConfigFile is representation of a configuration file that can be written to, read, and watched
+// for updates
+type ConfigFile struct {
+	store      storage.Storage
+	file       string
+	dataLock   *sync.Mutex
+	data       []byte
+	watchLock  *sync.Mutex
+	watchers   []*pHandler
+	runState   atomic.AtomicRunState
+	lastChange time.Time
+}
+
+// NewConfigFile creates a new ConfigFile instance using a specific storage.Storage and path relative
+// to the storage.
+func NewConfigFile(store storage.Storage, file string) *ConfigFile {
+	return &ConfigFile{
+		store:     store,
+		file:      file,
+		dataLock:  new(sync.Mutex),
+		data:      nil,
+		watchLock: new(sync.Mutex),
+	}
+}
+
+// Path returns the fully qualified path of the config file.
+func (cf *ConfigFile) Path() string {
+	if cf.store == nil {
+		return cf.file
+	}
+
+	return cf.store.FullPath(cf.file)
+}
+
+// Write will write the binary data to the file.
+func (cf *ConfigFile) Write(data []byte) error {
+	if cf.store == nil {
+		return NoBackingStore
+	}
+
+	e := cf.store.Write(cf.file, data)
+	// update cache on successful write
+	if e == nil {
+		cf.dataLock.Lock()
+		cf.data = data
+		cf.dataLock.Unlock()
+	}
+	return e
+}
+
+// Read will read the binary data from the file and return it. If an error is returned,
+// the byte array will be nil.
+func (cf *ConfigFile) Read() ([]byte, error) {
+	return cf.internalRead(false)
+}
+
+// internalRead is used to allow a forced override of data cache to refresh data
+func (cf *ConfigFile) internalRead(force bool) ([]byte, error) {
+	if cf.store == nil {
+		return nil, NoBackingStore
+	}
+
+	cf.dataLock.Lock()
+	defer cf.dataLock.Unlock()
+	if !force {
+		if cf.data != nil {
+			return cf.data, nil
+		}
+	}
+
+	d, e := cf.store.Read(cf.file)
+	if e != nil {
+		return nil, e
+	}
+	cf.data = d
+	return cf.data, nil
+}
+
+// Stat returns the StorageStats for the file.
+func (cf *ConfigFile) Stat() (*storage.StorageInfo, error) {
+	if cf.store == nil {
+		return nil, NoBackingStore
+	}
+
+	return cf.store.Stat(cf.file)
+}
+
+// Exists returns true if the file exist. If an error other than a NotExist error is returned,
+// the result will be false with the provided error.
+func (cf *ConfigFile) Exists() (bool, error) {
+	if cf.store == nil {
+		return false, NoBackingStore
+	}
+
+	return cf.store.Exists(cf.file)
+}
+
+// Delete removes the file from storage permanently.
+func (cf *ConfigFile) Delete() error {
+	if cf.store == nil {
+		return NoBackingStore
+	}
+
+	e := cf.store.Remove(cf.file)
+
+	// on removal, clear data cache
+	if e == nil {
+		cf.dataLock.Lock()
+		cf.data = nil
+		cf.dataLock.Unlock()
+	}
+	return e
+}
+
+// Refresh allows external callers to force reload the config file from internal storage. This is
+// particularly useful when no change listeners exist on the config, since without them the
+// data cache is not updated automatically on change
+func (cf *ConfigFile) Refresh() ([]byte, error) {
+	return cf.internalRead(true)
+}
+
+// AddChangeHandler accepts a ConfigChangedHandler function which will be called whenever the implementation
+// detects that a change has been made. A unique HandlerID is returned that can be used to remove the handler
+// if necessary.
+func (cf *ConfigFile) AddChangeHandler(handler ConfigChangedHandler) HandlerID {
+	return cf.AddPriorityChangeHandler(handler, DefaultHandlerPriority)
+}
+
+// AddPriorityChangeHandler allows adding a config change handler with a specific priority. Handlers
+// added via AddChangeHandler use DefaultHandlerPriority (1000). The lower the priority value, the
+// sooner in the handler execution order it will be called.
+func (cf *ConfigFile) AddPriorityChangeHandler(handler ConfigChangedHandler, priority int) HandlerID {
+	cf.watchLock.Lock()
+	defer cf.watchLock.Unlock()
+
+	h := &pHandler{
+		id:       HandlerID(uuid.NewString()),
+		handler:  handler,
+		priority: priority,
+	}
+
+	cf.watchers = append(cf.watchers, h)
+
+	// create the actual file watcher once we have at least one active watcher func registered
+	if len(cf.watchers) == 1 {
+		cf.runWatcher()
+	}
+
+	return h.id
+}
+
+// RemoveChangeHandler removes the change handler with the provided identifier if it exists. True
+// is returned if the handler was removed (it existed), false otherwise.
+func (cf *ConfigFile) RemoveChangeHandler(id HandlerID) bool {
+	cf.watchLock.Lock()
+	defer cf.watchLock.Unlock()
+
+	for i := range cf.watchers {
+		if cf.watchers[i] != nil && cf.watchers[i].id == id {
+			copy(cf.watchers[i:], cf.watchers[i+1:])
+			cf.watchers[len(cf.watchers)-1] = nil
+			cf.watchers = cf.watchers[:len(cf.watchers)-1]
+
+			// stop watching the file for changes if there are no more external watchers
+			if len(cf.watchers) == 0 {
+				cf.stopWatcher()
+			}
+
+			return true
+		}
+	}
+	return false
+}
+
+// RemoveAllHandlers removes all added handlers
+func (cf *ConfigFile) RemoveAllHandlers() {
+	cf.watchLock.Lock()
+	defer cf.watchLock.Unlock()
+
+	cf.watchers = nil
+
+	cf.stopWatcher()
+}
+
+// runWatcher creates a goroutine which will poll the stat of a storage target on a specific
+// interval and dispatch created, modified, and deleted events for that file.
+func (cf *ConfigFile) runWatcher() {
+	// we wait for a reset on the run state prior to starting, which
+	// will only block iff the run state is in the process of stopping
+	cf.runState.WaitForReset()
+
+	// if start fails after waiting for a reset, it means that another thread
+	// beat this thread to the start
+	if !cf.runState.Start() {
+		log.Warningf("Run watcher already running for file: %s", cf.file)
+		return
+	}
+
+	go func() {
+		first := true
+
+		var last time.Time
+		var exists bool
+
+		for {
+			// Each iteration, check for the stop trigger, or wait 10 seconds
+			select {
+			case <-cf.runState.OnStop():
+				cf.runState.Reset()
+				return
+			case <-time.After(10 * time.Second):
+			}
+
+			// Query stat on the file, on errors other than exists,
+			// we'll need to log the error, and perhaps limit the retries
+			st, err := cf.Stat()
+			if err != nil && !os.IsNotExist(err) {
+				log.Errorf("Storage Stat Error: %s", err)
+				continue
+			}
+
+			// On first iteration, set exists and last modification time (if applicable)
+			// and flip flag
+			if first {
+				exists = !os.IsNotExist(err)
+				if exists {
+					last = st.ModTime
+				}
+				first = false
+				continue
+			}
+
+			// File does not exist in storage, need to check to see if that is different
+			// from last state check
+			if os.IsNotExist(err) {
+				// check to see if the file has gone from exists to !exists
+				if exists {
+					exists = false
+					cf.onFileChange(ChangeTypeDeleted, nil)
+				}
+				continue
+			}
+
+			// check to see if the file has gone from !exists to exists
+			if !exists {
+				data, err := cf.internalRead(true)
+				if err != nil {
+					log.Warningf("Read() Error: %s\n", err)
+					continue
+				}
+				exists = true
+				last = st.ModTime
+				cf.onFileChange(ChangeTypeCreated, data)
+				continue
+			}
+
+			mtime := st.ModTime
+			if mtime != last {
+				last = mtime
+				data, err := cf.internalRead(true)
+				if err != nil {
+					log.Errorf("Read() Error: %s\n", err)
+					continue
+				}
+				cf.onFileChange(ChangeTypeModified, data)
+			}
+		}
+	}()
+}
+
+// stopWatcher closes the stop channel, which returns from the runWatcher goroutine. This lets us
+// drop the polling stat checks on files once there are no change handlers.
+func (cf *ConfigFile) stopWatcher() {
+	cf.runState.Stop()
+}
+
+// onFileChange is internally called when the core watcher recognizes a change in the ConfigFile. This
+// method dispatches that change to all added watchers
+func (cf *ConfigFile) onFileChange(changeType ChangeType, newData []byte) {
+	// On change, we copy out the handlers to a separate slice for processing for a few reasons:
+	// 1. We don't want to lock while executing the handlers
+	// 2. Handlers may want to operate on the ConfigFile instance, which would result in a deadlock
+	// 3. Allows us to implement priority sorting outside of the lock as well
+	cf.watchLock.Lock()
+	if len(cf.watchers) == 0 {
+		cf.watchLock.Unlock()
+		return
+	}
+
+	toNotify := make([]*pHandler, len(cf.watchers))
+	copy(toNotify, cf.watchers)
+	cf.watchLock.Unlock()
+
+	sort.SliceStable(toNotify, func(i, j int) bool {
+		return toNotify[i].priority < toNotify[j].priority
+	})
+
+	for _, handler := range toNotify {
+		handler.handler(changeType, newData)
+	}
+}
+
+//--------------------------------------------------------------------------
+//  pHandler
+//--------------------------------------------------------------------------
+
+// pHandler is a wrapper type used to assign a ConfigChangedHandler a unique identifier and priority.
+type pHandler struct {
+	id       HandlerID
+	handler  ConfigChangedHandler
+	priority int
+}

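A minimal usage sketch for the handler API above (the handler signature is inferred from the dispatch in onFileChange; the file path and log line are illustrative):

	cf := confManager.ConfigFileAt("/var/configs/cluster-info.json")

	// Runs before any default-priority (1000) handlers on each change.
	id := cf.AddPriorityChangeHandler(func(change config.ChangeType, data []byte) {
		log.Infof("cluster-info %v: %d bytes", change, len(data))
	}, 100)

	// Removing the last handler also stops the polling watcher.
	cf.RemoveChangeHandler(id)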
+ 98 - 0
pkg/config/configmanager.go

@@ -0,0 +1,98 @@
+package config
+
+import (
+	"io/ioutil"
+	"sync"
+
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/storage"
+)
+
+//--------------------------------------------------------------------------
+//  ConfigFileManagerOpts
+//--------------------------------------------------------------------------
+
+// ConfigFileManagerOpts describes how to configure the ConfigFileManager for
+// serving configuration files
+type ConfigFileManagerOpts struct {
+	// BucketStoreConfig is the local file location for the configuration used to
+	// write and read configuration data to/from the bucket. The format of this
+	// configuration file should be compatible with storage.NewBucketStorage
+	BucketStoreConfig string
+
+	// LocalConfigPath provides a backup location for storing the configuration
+	// files
+	LocalConfigPath string
+}
+
+// IsBucketStorageEnabled returns true if bucket storage is enabled.
+func (cfmo *ConfigFileManagerOpts) IsBucketStorageEnabled() bool {
+	return cfmo.BucketStoreConfig != ""
+}
+
+// DefaultConfigFileManagerOpts returns the default configuration options for the
+// config file manager
+func DefaultConfigFileManagerOpts() *ConfigFileManagerOpts {
+	return &ConfigFileManagerOpts{
+		BucketStoreConfig: "",
+		LocalConfigPath:   "/",
+	}
+}
+
+//--------------------------------------------------------------------------
+//  ConfigFileManager
+//--------------------------------------------------------------------------
+
+// ConfigFileManager is a facade providing a central API used to create and watch
+// config files.
+type ConfigFileManager struct {
+	lock  *sync.Mutex
+	store storage.Storage
+	files map[string]*ConfigFile
+}
+
+// NewConfigFileManager creates a new backing storage and configuration file manager
+func NewConfigFileManager(opts *ConfigFileManagerOpts) *ConfigFileManager {
+	if opts == nil {
+		opts = DefaultConfigFileManagerOpts()
+	}
+
+	var configStore storage.Storage
+	if opts.IsBucketStorageEnabled() {
+		bucketConfig, err := ioutil.ReadFile(opts.BucketStoreConfig)
+		if err != nil {
+			log.Warningf("Failed to initialize config bucket storage: %s", err)
+		} else {
+			bucketStore, err := storage.NewBucketStorage(bucketConfig)
+			if err != nil {
+				log.Warningf("Failed to create config bucket storage: %s", err)
+			} else {
+				configStore = bucketStore
+			}
+		}
+	} else {
+		configStore = storage.NewFileStorage(opts.LocalConfigPath)
+	}
+
+	return &ConfigFileManager{
+		lock:  new(sync.Mutex),
+		store: configStore,
+		files: make(map[string]*ConfigFile),
+	}
+}
+
+// ConfigFileAt returns the existing ConfigFile instance for the provided path if one exists.
+// Otherwise, a new instance is created and returned. Note that the path does not have to exist
+// on the storage for the instance to be created; it may be a potential file path that is only
+// written later.
+func (cfm *ConfigFileManager) ConfigFileAt(path string) *ConfigFile {
+	cfm.lock.Lock()
+	defer cfm.lock.Unlock()
+	if cf, ok := cfm.files[path]; ok {
+		return cf
+	}
+
+	cf := NewConfigFile(cfm.store, path)
+	cfm.files[path] = cf
+	return cf
+}

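A condensed wiring sketch (the same pattern appears in router.go further below; the bucket env var and paths are the ones used there):

	confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
		BucketStoreConfig: env.GetKubecostConfigBucket(), // empty => local file storage
		LocalConfigPath:   "/",
	})

	// Repeated calls with the same path return the same *ConfigFile instance.
	cacheFile := confManager.ConfigFileAt("/var/configs/cluster-cache.json")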
+ 22 - 0
pkg/costmodel/allocation.go

@@ -585,6 +585,16 @@ func applyPodResults(window kubecost.Window, resolution time.Duration, podMap ma
 		allocStart = allocStart.Add(time.Duration(startAdjustmentCoeff*100) * resolution / time.Duration(100))
 		allocEnd = allocEnd.Add(-time.Duration(endAdjustmentCoeff*100) * resolution / time.Duration(100))
 
+		// Ensure that the allocStart is always within the window, adjusting
+		// for the occasions where start falls 1m before the query window.
+		// NOTE: window here will always be closed (so no need to nil check
+		// "start").
+		// TODO:CLEANUP revisit query methodology to figure out why this is
+		// happening on occasion
+		if allocStart.Before(*window.Start()) {
+			allocStart = *window.Start()
+		}
+
 		// If there is only one point with a value <= 0.5 that the start and
 		// end timestamps both share, then we will enter this case because at
 		// least half of a resolution will be subtracted from both the start
@@ -598,6 +608,18 @@ func applyPodResults(window kubecost.Window, resolution time.Duration, podMap ma
 			allocEnd = allocEnd.Add(time.Duration(50*endAdjustmentCoeff) * resolution / time.Duration(100))
 		}
 
+		// Ensure that the allocEnd is always within the window, adjusting
+		// for the occasions where end falls 1m after the query window. This
+		// has never been observed, but the check is symmetrical with the
+		// start check above.
+		// NOTE: window here will always be closed (so no need to nil check
+		// "end").
+		// TODO:CLEANUP revisit query methodology to determine whether this
+		// can actually occur
+		if allocEnd.After(*window.End()) {
+			allocEnd = *window.End()
+		}
+
 		// Set start if unset or this datum's start time is earlier than the
 		// current earliest time.
 		if _, ok := clusterStart[cluster]; !ok || allocStart.Before(clusterStart[cluster]) {

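A quick illustration of the clamp with hypothetical values (NewWindow and Window.Start are used as elsewhere in pkg/kubecost):

	start := time.Date(2021, 6, 1, 12, 0, 0, 0, time.UTC)
	end := start.Add(time.Hour)
	window := kubecost.NewWindow(&start, &end)

	allocStart := start.Add(-time.Minute) // datum began 1m before the window
	if allocStart.Before(*window.Start()) {
		allocStart = *window.Start() // clamped back to 12:00
	}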
+ 3 - 2
pkg/costmodel/cluster_helpers_test.go

@@ -1,6 +1,7 @@
 package costmodel
 
 import (
+	"github.com/kubecost/cost-model/pkg/config"
 	"reflect"
 	"testing"
 	"time"
@@ -852,7 +853,7 @@ func TestBuildGPUCostMap(t *testing.T) {
 	for _, testCase := range cases {
 		t.Run(testCase.name, func(t *testing.T) {
 			testProvider := &cloud.CustomProvider{
-				Config: cloud.NewProviderConfig("fakeFile"),
+				Config: cloud.NewProviderConfig(config.NewConfigFileManager(nil), "fakeFile"),
 			}
 			testPreemptible := make(map[NodeIdentifier]bool)
 			result, _ := buildGPUCostMap(testCase.promResult, testCase.countMap, testProvider, testPreemptible)
@@ -980,7 +981,7 @@ func TestAssetCustompricing(t *testing.T) {
 	for _, testCase := range cases {
 		t.Run(testCase.name, func(t *testing.T) {
 			testProvider := &cloud.CustomProvider{
-				Config: cloud.NewProviderConfig(""),
+				Config: cloud.NewProviderConfig(config.NewConfigFileManager(nil), ""),
 			}
 			testProvider.UpdateConfigFromConfigMap(testCase.customPricingMap)
 

+ 82 - 21
pkg/costmodel/clusterinfo.go

@@ -4,9 +4,12 @@ import (
 	"fmt"
 
 	cloudProvider "github.com/kubecost/cost-model/pkg/cloud"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 	"github.com/kubecost/cost-model/pkg/env"
+	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/thanos"
+	"github.com/kubecost/cost-model/pkg/util/json"
 
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/klog"
@@ -33,6 +36,7 @@ func writeClusterProfile(clusterInfo map[string]string) {
 	clusterInfo["clusterProfile"] = clusterProfile
 }
 
+// writeThanosFlags includes the configured thanos flags on the cluster info
 func writeThanosFlags(clusterInfo map[string]string) {
 	// Include Thanos Offset Duration if Applicable
 	clusterInfo["thanosEnabled"] = fmt.Sprintf("%t", thanos.IsEnabled())
@@ -41,38 +45,22 @@ func writeThanosFlags(clusterInfo map[string]string) {
 	}
 }
 
-// default local cluster info provider implementation which provides an instanced object for
-// getting the local cluster info
-type defaultLocalClusterInfoProvider struct {
+// localClusterInfoProvider gets the local cluster info from the cloud provider and kubernetes
+type localClusterInfoProvider struct {
 	k8s      kubernetes.Interface
 	provider cloudProvider.Provider
 }
 
 // GetClusterInfo returns a string map containing the local cluster info
-func (dlcip *defaultLocalClusterInfoProvider) GetClusterInfo() map[string]string {
-	return GetClusterInfo(dlcip.k8s, dlcip.provider)
-}
-
-// NewLocalClusterInfoProvider creates a new clusters.LocalClusterInfoProvider implementation for providing local
-// cluster information
-func NewLocalClusterInfoProvider(k8s kubernetes.Interface, cloud cloudProvider.Provider) clusters.LocalClusterInfoProvider {
-	return &defaultLocalClusterInfoProvider{
-		k8s:      k8s,
-		provider: cloud,
-	}
-}
-
-// GetClusterInfo provides specific information about the cluster cloud provider as well as
-// generic configuration values.
-func GetClusterInfo(kubeClient kubernetes.Interface, cloud cloudProvider.Provider) map[string]string {
-	data, err := cloud.ClusterInfo()
+func (dlcip *localClusterInfoProvider) GetClusterInfo() map[string]string {
+	data, err := dlcip.provider.ClusterInfo()
 
 	// Ensure we create the info object if it doesn't exist
 	if data == nil {
 		data = make(map[string]string)
 	}
 
-	kc, ok := kubeClient.(*kubernetes.Clientset)
+	kc, ok := dlcip.k8s.(*kubernetes.Clientset)
 	if ok && data != nil {
 		v, err := kc.ServerVersion()
 		if err != nil {
@@ -90,3 +78,76 @@ func GetClusterInfo(kubeClient kubernetes.Interface, cloud cloudProvider.Provide
 
 	return data
 }
+
+// NewLocalClusterInfoProvider creates a new clusters.LocalClusterInfoProvider implementation for providing local
+// cluster information
+func NewLocalClusterInfoProvider(k8s kubernetes.Interface, cloud cloudProvider.Provider) clusters.ClusterInfoProvider {
+	return &localClusterInfoProvider{
+		k8s:      k8s,
+		provider: cloud,
+	}
+}
+
+// configuredClusterInfoProvider provides the cluster info directly from the config file source.
+type configuredClusterInfoProvider struct {
+	config *config.ConfigFile
+}
+
+// GetClusterInfo returns a string map containing the local cluster info
+func (ccip *configuredClusterInfoProvider) GetClusterInfo() map[string]string {
+	clusterInfo := map[string]string{}
+
+	data, err := ccip.config.Refresh()
+	if err != nil {
+		return clusterInfo
+	}
+
+	err = json.Unmarshal(data, &clusterInfo)
+	if err != nil {
+		log.Warningf("ClusterInfo failed to load from configuration: %s", err)
+		return clusterInfo
+	}
+
+	return clusterInfo
+}
+
+// NewConfiguredClusterInfoProvider instantiates and returns a cluster info provider which loads cluster info from
+// a config file.
+func NewConfiguredClusterInfoProvider(config *config.ConfigFile) clusters.ClusterInfoProvider {
+	return &configuredClusterInfoProvider{
+		config: config,
+	}
+}
+
+// clusterInfoWriteOnRequest writes the cluster info result to a config whenever it's requested
+type clusterInfoWriteOnRequest struct {
+	clusterInfo clusters.ClusterInfoProvider
+	config      *config.ConfigFile
+}
+
+// GetClusterInfo returns a string map containing the local cluster info
+func (ciw *clusterInfoWriteOnRequest) GetClusterInfo() map[string]string {
+	cInfo := ciw.clusterInfo.GetClusterInfo()
+
+	result, err := json.Marshal(cInfo)
+	if err != nil {
+		log.Warningf("Failed to write the cluster info: %s", err)
+		return cInfo
+	}
+
+	err = ciw.config.Write(result)
+	if err != nil {
+		log.Warningf("Failed to write the cluster info to config: %s", err)
+	}
+
+	return cInfo
+}
+
+// NewClusterInfoWriteOnRequest instantiates and returns a cluster info provider which persists the cluster info to a
+// configuration file on each request.
+func NewClusterInfoWriteOnRequest(clusterInfo clusters.ClusterInfoProvider, config *config.ConfigFile) clusters.ClusterInfoProvider {
+	return &clusterInfoWriteOnRequest{
+		clusterInfo: clusterInfo,
+		config:      config,
+	}
+}

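A sketch of how the three implementations compose (router.go below wires the first two directly; pairing the decorator with the local provider is illustrative):

	clusterInfoFile := confManager.ConfigFileAt("/var/configs/cluster-info.json")

	// Pods that shouldn't hit the cloud provider read the shared file...
	var cip clusters.ClusterInfoProvider = NewConfiguredClusterInfoProvider(clusterInfoFile)

	// ...while a pod that owns the lookup can persist its result on each request.
	cip = NewClusterInfoWriteOnRequest(
		NewLocalClusterInfoProvider(kubeClientset, cloudProvider),
		clusterInfoFile,
	)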
+ 23 - 24
pkg/costmodel/clusters/clustermap.go

@@ -7,7 +7,6 @@ import (
 	"sync"
 	"time"
 
-	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/prom"
 	"github.com/kubecost/cost-model/pkg/thanos"
@@ -76,31 +75,31 @@ type ClusterMap interface {
 	StopRefresh()
 }
 
-// LocalClusterInfoProvider is a contract which is capable of performing local cluster info lookups.
-type LocalClusterInfoProvider interface {
-	// GetClusterInfo returns a string map containing the local cluster info
+// ClusterInfoProvider is a contract which is capable of performing cluster info lookups.
+type ClusterInfoProvider interface {
+	// GetClusterInfo returns a string map containing the local/remote connected cluster info
 	GetClusterInfo() map[string]string
 }
 
 // ClusterMap keeps records of all known cost-model clusters.
 type PrometheusClusterMap struct {
-	lock         *sync.RWMutex
-	client       prometheus.Client
-	clusters     map[string]*ClusterInfo
-	localCluster LocalClusterInfoProvider
-	stop         chan struct{}
+	lock        *sync.RWMutex
+	client      prometheus.Client
+	clusters    map[string]*ClusterInfo
+	clusterInfo ClusterInfoProvider
+	stop        chan struct{}
 }
 
 // NewClusterMap creates a new ClusterMap implementation using a prometheus or thanos client
-func NewClusterMap(client prometheus.Client, lcip LocalClusterInfoProvider, refresh time.Duration) ClusterMap {
+func NewClusterMap(client prometheus.Client, cip ClusterInfoProvider, refresh time.Duration) ClusterMap {
 	stop := make(chan struct{})
 
 	cm := &PrometheusClusterMap{
-		lock:         new(sync.RWMutex),
-		client:       client,
-		clusters:     make(map[string]*ClusterInfo),
-		localCluster: lcip,
-		stop:         stop,
+		lock:        new(sync.RWMutex),
+		client:      client,
+		clusters:    make(map[string]*ClusterInfo),
+		clusterInfo: cip,
+		stop:        stop,
 	}
 
 	// Run an updater to ensure cluster data stays relevant over time
@@ -210,14 +209,14 @@ func (pcm *PrometheusClusterMap) loadClusters() (map[string]*ClusterInfo, error)
 	}
 
 	// populate the local cluster if it doesn't exist
-	localID := env.GetClusterID()
-	if _, ok := clusters[localID]; !ok {
-		localInfo, err := pcm.getLocalClusterInfo()
-		if err != nil {
-			log.Warningf("Failed to load local cluster info: %s", err)
-		} else {
-			clusters[localInfo.ID] = localInfo
-		}
+	localInfo, err := pcm.getLocalClusterInfo()
+	if err != nil {
+		return clusters, nil
+	}
+
+	// Check to see if the local cluster's id is part of our loaded clusters, and include if not
+	if _, ok := clusters[localInfo.ID]; !ok {
+		clusters[localInfo.ID] = localInfo
 	}
 
 	return clusters, nil
@@ -225,7 +224,7 @@ func (pcm *PrometheusClusterMap) loadClusters() (map[string]*ClusterInfo, error)
 
 // getLocalClusterInfo returns the local cluster info in the event there does not exist a metric available.
 func (pcm *PrometheusClusterMap) getLocalClusterInfo() (*ClusterInfo, error) {
-	info := pcm.localCluster.GetClusterInfo()
+	info := pcm.clusterInfo.GetClusterInfo()
 
 	var id string
 	var name string

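Since the contract is now a single method, a stub for tests is trivial (hypothetical; the "id" and "name" keys are illustrative):

	type staticClusterInfoProvider map[string]string

	func (s staticClusterInfoProvider) GetClusterInfo() map[string]string {
		return s
	}

	cm := clusters.NewClusterMap(promClient, staticClusterInfoProvider{
		"id":   "cluster-one",
		"name": "Cluster One",
	}, 5*time.Minute)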
+ 19 - 52
pkg/costmodel/metrics.go

@@ -9,20 +9,20 @@ import (
 
 	"github.com/kubecost/cost-model/pkg/cloud"
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/errors"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/metrics"
 	"github.com/kubecost/cost-model/pkg/prom"
 	"github.com/kubecost/cost-model/pkg/util"
+	"github.com/kubecost/cost-model/pkg/util/atomic"
 
 	promclient "github.com/prometheus/client_golang/api"
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
 	v1 "k8s.io/api/core/v1"
 
-	"k8s.io/client-go/kubernetes"
-
 	"k8s.io/klog"
 )
 
@@ -32,8 +32,7 @@ import (
 
 // ClusterInfoCollector is a prometheus collector that generates ClusterInfoMetrics
 type ClusterInfoCollector struct {
-	Cloud         cloud.Provider
-	KubeClientSet kubernetes.Interface
+	ClusterInfo clusters.ClusterInfoProvider
 }
 
 // Describe sends the super-set of all possible descriptors of metrics
@@ -44,7 +43,7 @@ func (cic ClusterInfoCollector) Describe(ch chan<- *prometheus.Desc) {
 
 // Collect is called by the Prometheus registry when collecting metrics.
 func (cic ClusterInfoCollector) Collect(ch chan<- prometheus.Metric) {
-	clusterInfo := GetClusterInfo(cic.KubeClientSet, cic.Cloud)
+	clusterInfo := cic.ClusterInfo.GetClusterInfo()
 	labels := prom.MapToLabels(clusterInfo)
 
 	m := newClusterInfoMetric("kubecost_cluster_info", labels)
@@ -126,7 +125,7 @@ var (
 )
 
 // initCostModelMetrics uses a sync.Once to ensure that these metrics are only created once
-func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud.Provider) {
+func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud.Provider, clusterInfo clusters.ClusterInfoProvider) {
 	metricsInit.Do(func() {
 		cpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_cpu_hourly_cost",
@@ -216,8 +215,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud
 
 		// General Metric Collectors
 		prometheus.MustRegister(ClusterInfoCollector{
-			KubeClientSet: clusterCache.GetClient(),
-			Cloud:         provider,
+			ClusterInfo: clusterInfo,
 		})
 	})
 }
@@ -252,16 +250,14 @@ type CostModelMetricsEmitter struct {
 	NetworkRegionEgressRecorder   prometheus.Gauge
 	NetworkInternetEgressRecorder prometheus.Gauge
 
-	// Flow Control
-	recordingLock     *sync.Mutex
-	recordingStopping bool
-	recordingStop     chan bool
+	// Concurrent Flow Control - Manages the run state of the metric emitter
+	runState atomic.AtomicRunState
 }
 
 // NewCostModelMetricsEmitter creates a new cost-model metrics emitter. Use Start() to begin metric emission.
-func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clustercache.ClusterCache, provider cloud.Provider, model *CostModel) *CostModelMetricsEmitter {
+func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clustercache.ClusterCache, provider cloud.Provider, clusterInfo clusters.ClusterInfoProvider, model *CostModel) *CostModelMetricsEmitter {
 	// init will only actually execute once to register the custom gauges
-	initCostModelMetrics(clusterCache, provider)
+	initCostModelMetrics(clusterCache, provider, clusterInfo)
 
 	metrics.InitKubeMetrics(clusterCache, &metrics.KubeMetricsOpts{
 		EmitKubecostControllerMetrics: true,
@@ -292,33 +288,12 @@ func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clust
 		NetworkInternetEgressRecorder: networkInternetEgressCostG,
 		ClusterManagementCostRecorder: clusterManagementCostGv,
 		LBCostRecorder:                lbCostGv,
-		recordingLock:                 new(sync.Mutex),
-		recordingStopping:             false,
-		recordingStop:                 nil,
-	}
-}
-
-// Checks to see if there is a metric recording stop channel. If it exists, a new
-// channel is not created and false is returned. If it doesn't exist, a new channel
-// is created and true is returned.
-func (cmme *CostModelMetricsEmitter) checkOrCreateRecordingChan() bool {
-	cmme.recordingLock.Lock()
-	defer cmme.recordingLock.Unlock()
-
-	if cmme.recordingStop != nil {
-		return false
 	}
-
-	cmme.recordingStop = make(chan bool, 1)
-	return true
 }
 
 // IsRunning returns true if metric recording is running.
 func (cmme *CostModelMetricsEmitter) IsRunning() bool {
-	cmme.recordingLock.Lock()
-	defer cmme.recordingLock.Unlock()
-
-	return cmme.recordingStop != nil
+	return cmme.runState.IsRunning()
 }
 
 // NodeCostAverages tracks a running average of a node's cost attributes.
@@ -333,10 +308,11 @@ type NodeCostAverages struct {
 // StartCostModelMetricRecording starts the go routine that emits metrics used to determine
 // cluster costs.
 func (cmme *CostModelMetricsEmitter) Start() bool {
-	// Check to see if we're already recording
-	// This function will create the stop recording channel and return true
-	// if it doesn't exist.
-	if !cmme.checkOrCreateRecordingChan() {
+	// wait for a reset to prevent a race between start and stop calls
+	cmme.runState.WaitForReset()
+
+	// Check to see if we're already recording, and atomically advance the run state to start if we're not
+	if !cmme.runState.Start() {
 		log.Errorf("Attempted to start cost model metric recording when it's already running.")
 		return false
 	}
@@ -699,11 +675,8 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 
 			select {
 			case <-time.After(time.Minute):
-			case <-cmme.recordingStop:
-				cmme.recordingLock.Lock()
-				cmme.recordingStopping = false
-				cmme.recordingStop = nil
-				cmme.recordingLock.Unlock()
+			case <-cmme.runState.OnStop():
+				cmme.runState.Reset()
 				return
 			}
 		}
@@ -715,11 +688,5 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 // Stop halts the metrics emission loop after the current emission is completed
 // or if the emission is paused.
 func (cmme *CostModelMetricsEmitter) Stop() {
-	cmme.recordingLock.Lock()
-	defer cmme.recordingLock.Unlock()
-
-	if !cmme.recordingStopping && cmme.recordingStop != nil {
-		cmme.recordingStopping = true
-		close(cmme.recordingStop)
-	}
+	cmme.runState.Stop()
 }

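The emitter now follows the same run-state discipline as the config watcher above; a condensed sketch of the pattern (assuming pkg/util/atomic behaves as used here):

	type emitter struct {
		runState atomic.AtomicRunState
	}

	func (e *emitter) Start() bool {
		e.runState.WaitForReset() // blocks only while a prior Stop() settles
		if !e.runState.Start() {  // another caller won the start race
			return false
		}
		go func() {
			for {
				select {
				case <-e.runState.OnStop():
					e.runState.Reset() // re-arm for a future Start()
					return
				case <-time.After(time.Minute):
					// ... emit metrics ...
				}
			}
		}()
		return true
	}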
+ 163 - 123
pkg/costmodel/router.go

@@ -7,13 +7,13 @@ import (
 	"fmt"
 	"io/ioutil"
 	"net/http"
-	"os"
 	"reflect"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/services"
 	"github.com/kubecost/cost-model/pkg/util/httputil"
 	"github.com/kubecost/cost-model/pkg/util/timeutil"
@@ -23,8 +23,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/klog"
 
-	k8serrors "k8s.io/apimachinery/pkg/api/errors"
-
 	"github.com/julienschmidt/httprouter"
 
 	sentry "github.com/getsentry/sentry-go"
@@ -41,6 +39,7 @@ import (
 	"github.com/kubecost/cost-model/pkg/util/json"
 	prometheus "github.com/prometheus/client_golang/api"
 	prometheusAPI "github.com/prometheus/client_golang/api/prometheus/v1"
+	appsv1 "k8s.io/api/apps/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	"github.com/patrickmn/go-cache"
@@ -72,20 +71,23 @@ var (
 // Accesses defines a singleton application instance, providing access to
 // Prometheus, Kubernetes, the cloud provider, and caches.
 type Accesses struct {
-	Router            *httprouter.Router
-	PrometheusClient  prometheus.Client
-	ThanosClient      prometheus.Client
-	KubeClientSet     kubernetes.Interface
-	ClusterMap        clusters.ClusterMap
-	CloudProvider     cloud.Provider
-	Model             *CostModel
-	MetricsEmitter    *CostModelMetricsEmitter
-	OutOfClusterCache *cache.Cache
-	AggregateCache    *cache.Cache
-	CostDataCache     *cache.Cache
-	ClusterCostsCache *cache.Cache
-	CacheExpiration   map[time.Duration]time.Duration
-	AggAPI            Aggregator
+	Router              *httprouter.Router
+	PrometheusClient    prometheus.Client
+	ThanosClient        prometheus.Client
+	KubeClientSet       kubernetes.Interface
+	ClusterCache        clustercache.ClusterCache
+	ClusterMap          clusters.ClusterMap
+	CloudProvider       cloud.Provider
+	ConfigFileManager   *config.ConfigFileManager
+	ClusterInfoProvider clusters.ClusterInfoProvider
+	Model               *CostModel
+	MetricsEmitter      *CostModelMetricsEmitter
+	OutOfClusterCache   *cache.Cache
+	AggregateCache      *cache.Cache
+	CostDataCache       *cache.Cache
+	ClusterCostsCache   *cache.Cache
+	CacheExpiration     map[time.Duration]time.Duration
+	AggAPI              Aggregator
 	// SettingsCache stores current state of app settings
 	SettingsCache *cache.Cache
 	// settingsSubscribers tracks channels through which changes to different
@@ -352,6 +354,14 @@ func WrapDataWithMessageAndWarning(data interface{}, err error, message, warning
 	return resp
 }
 
+// wrapAsObjectItems wraps a slice of items into an object containing a single items list,
+// which allows our k8s proxy methods to emulate a List() response from the k8s API
+func wrapAsObjectItems(items interface{}) map[string]interface{} {
+	return map[string]interface{}{
+		"items": items,
+	}
+}
+
 // RefreshPricingData needs to be called when a new node joins the fleet, since we cache the relevant subsets of pricing data to avoid storing the whole thing.
 func (a *Accesses) RefreshPricingData(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
@@ -694,7 +704,7 @@ func (a *Accesses) ClusterInfo(w http.ResponseWriter, r *http.Request, ps httpro
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	data := GetClusterInfo(a.KubeClientSet, a.CloudProvider)
+	data := a.ClusterInfoProvider.GetClusterInfo()
 
 	w.Write(WrapData(data, nil))
 }
@@ -928,12 +938,9 @@ func (a *Accesses) GetAllPersistentVolumes(w http.ResponseWriter, r *http.Reques
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	pvList, err := a.KubeClientSet.CoreV1().PersistentVolumes().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting persistent volume %v\n", err)
-	}
+	pvList := a.ClusterCache.GetAllPersistentVolumes()
 
-	body, err := json.Marshal(pvList)
+	body, err := json.Marshal(wrapAsObjectItems(pvList))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding persistent volumes: "+err.Error())
 	} else {
@@ -945,12 +952,28 @@ func (a *Accesses) GetAllPersistentVolumes(w http.ResponseWriter, r *http.Reques
 func (a *Accesses) GetAllDeployments(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
-	namespace := r.URL.Query().Get("namespace")
-	deploymentsList, err := a.KubeClientSet.AppsV1().Deployments(namespace).List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting deployments %v\n", err)
+
+	qp := httputil.NewQueryParams(r.URL.Query())
+
+	namespace := qp.Get("namespace", "")
+
+	deploymentsList := a.ClusterCache.GetAllDeployments()
+
+	// filter for provided namespace
+	var deployments []*appsv1.Deployment
+	if namespace == "" {
+		deployments = deploymentsList
+	} else {
+		deployments = []*appsv1.Deployment{}
+
+		for _, d := range deploymentsList {
+			if d.Namespace == namespace {
+				deployments = append(deployments, d)
+			}
+		}
 	}
-	body, err := json.Marshal(deploymentsList)
+
+	body, err := json.Marshal(wrapAsObjectItems(deployments))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding deployment: "+err.Error())
 	} else {
@@ -962,11 +985,9 @@ func (a *Accesses) GetAllStorageClasses(w http.ResponseWriter, r *http.Request,
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	scList, err := a.KubeClientSet.StorageV1().StorageClasses().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting storageclasses: "+err.Error())
-	}
-	body, err := json.Marshal(scList)
+	scList := a.ClusterCache.GetAllStorageClasses()
+
+	body, err := json.Marshal(wrapAsObjectItems(scList))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding storageclasses: "+err.Error())
 	} else {
@@ -977,12 +998,28 @@ func (a *Accesses) GetAllStorageClasses(w http.ResponseWriter, r *http.Request,
 func (a *Accesses) GetAllStatefulSets(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
-	namespace := r.URL.Query().Get("namespace")
-	deploymentsList, err := a.KubeClientSet.AppsV1().StatefulSets(namespace).List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting deployments %v\n", err)
+
+	qp := httputil.NewQueryParams(r.URL.Query())
+
+	namespace := qp.Get("namespace", "")
+
+	statefulSetsList := a.ClusterCache.GetAllStatefulSets()
+
+	// filter for provided namespace
+	var statefulSets []*appsv1.StatefulSet
+	if namespace == "" {
+		statefulSets = statefulSetsList
+	} else {
+		statefulSets = []*appsv1.StatefulSet{}
+
+		for _, ss := range statefulSetsList {
+			if ss.Namespace == namespace {
+				statefulSets = append(statefulSets, ss)
+			}
+		}
 	}
-	body, err := json.Marshal(deploymentsList)
+
+	body, err := json.Marshal(wrapAsObjectItems(statefulSets))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding deployment: "+err.Error())
 	} else {
@@ -994,12 +1031,9 @@ func (a *Accesses) GetAllNodes(w http.ResponseWriter, r *http.Request, ps httpro
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	nodeList, err := a.KubeClientSet.CoreV1().Nodes().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting node %v\n", err)
-	}
+	nodeList := a.ClusterCache.GetAllNodes()
 
-	body, err := json.Marshal(nodeList)
+	body, err := json.Marshal(wrapAsObjectItems(nodeList))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding nodes: "+err.Error())
 	} else {
@@ -1011,12 +1045,9 @@ func (a *Accesses) GetAllPods(w http.ResponseWriter, r *http.Request, ps httprou
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	podlist, err := a.KubeClientSet.CoreV1().Pods("").List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting pod %v\n", err)
-	}
+	podlist := a.ClusterCache.GetAllPods()
 
-	body, err := json.Marshal(podlist)
+	body, err := json.Marshal(wrapAsObjectItems(podlist))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding pods: "+err.Error())
 	} else {
@@ -1028,11 +1059,9 @@ func (a *Accesses) GetAllNamespaces(w http.ResponseWriter, r *http.Request, ps h
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	namespaces, err := a.KubeClientSet.CoreV1().Namespaces().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting namespaces %v\n", err)
-	}
-	body, err := json.Marshal(namespaces)
+	namespaces := a.ClusterCache.GetAllNamespaces()
+
+	body, err := json.Marshal(wrapAsObjectItems(namespaces))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding deployment: "+err.Error())
 	} else {
@@ -1043,11 +1072,10 @@ func (a *Accesses) GetAllNamespaces(w http.ResponseWriter, r *http.Request, ps h
 func (a *Accesses) GetAllDaemonSets(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
-	daemonSets, err := a.KubeClientSet.AppsV1().DaemonSets("").List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting daemon sets %v\n", err)
-	}
-	body, err := json.Marshal(daemonSets)
+
+	daemonSets := a.ClusterCache.GetAllDaemonSets()
+
+	body, err := json.Marshal(wrapAsObjectItems(daemonSets))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding daemon set: "+err.Error())
 	} else {
@@ -1062,24 +1090,21 @@ func (a *Accesses) GetPod(w http.ResponseWriter, r *http.Request, ps httprouter.
 	podName := ps.ByName("name")
 	podNamespace := ps.ByName("namespace")
 
-	// Examples for error handling:
-	// - Use helper functions like e.g. errors.IsNotFound()
-	// - And/or cast to StatusError and use its properties like e.g. ErrStatus.Message
-	pod, err := a.KubeClientSet.CoreV1().Pods(podNamespace).Get(r.Context(), podName, metav1.GetOptions{})
-	if k8serrors.IsNotFound(err) {
-		fmt.Fprintf(w, "Pod not found\n")
-	} else if statusError, isStatus := err.(*k8serrors.StatusError); isStatus {
-		fmt.Fprintf(w, "Error getting pod %v\n", statusError.ErrStatus.Message)
-	} else if err != nil {
-		fmt.Fprintf(w, "Error getting pod: "+err.Error())
-	} else {
-		body, err := json.Marshal(pod)
-		if err != nil {
-			fmt.Fprintf(w, "Error decoding pod: "+err.Error())
-		} else {
-			w.Write(body)
+	// TODO: ClusterCache API could probably afford to have some better filtering
+	allPods := a.ClusterCache.GetAllPods()
+	for _, pod := range allPods {
+		if pod.Namespace == podNamespace && pod.Name == podName {
+			body, err := json.Marshal(pod)
+			if err != nil {
+				fmt.Fprintf(w, "Error decoding pod: "+err.Error())
+			} else {
+				w.Write(body)
+			}
+			return
 		}
 	}
+
+	fmt.Fprintf(w, "Pod not found\n")
 }
 
 func (a *Accesses) PrometheusRecordingRules(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
@@ -1105,9 +1130,9 @@ func (a *Accesses) PrometheusConfig(w http.ResponseWriter, r *http.Request, _ ht
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	pConfig := make(map[string]string)
-
-	pConfig["address"] = os.Getenv("PROMETHEUS_SERVER_ENDPOINT")
+	pConfig := map[string]string{
+		"address": env.GetPrometheusServerEndpoint(),
+	}
 
 	body, err := json.Marshal(pConfig)
 	if err != nil {
@@ -1140,13 +1165,10 @@ func (a *Accesses) GetOrphanedPods(w http.ResponseWriter, r *http.Request, ps ht
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	podlist, err := a.KubeClientSet.CoreV1().Pods("").List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting pod %v\n", err)
-	}
+	podlist := a.ClusterCache.GetAllPods()
 
-	var lonePods []v1.Pod
-	for _, pod := range podlist.Items {
+	var lonePods []*v1.Pod
+	for _, pod := range podlist {
 		if len(pod.OwnerReferences) == 0 {
 			lonePods = append(lonePods, pod)
 		}
@@ -1164,10 +1186,11 @@ func (a *Accesses) GetInstallNamespace(w http.ResponseWriter, r *http.Request, _
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	ns := os.Getenv("KUBECOST_NAMESPACE")
+	ns := env.GetKubecostNamespace()
 	w.Write([]byte(ns))
 }
 
+// logsFor pulls the logs for a specific pod, namespace, and container
 func logsFor(c kubernetes.Interface, namespace string, pod string, container string, dur time.Duration, ctx context.Context) (string, error) {
 	since := time.Now().UTC().Add(-dur)
 
@@ -1196,18 +1219,13 @@ func (a *Accesses) GetPodLogs(w http.ResponseWriter, r *http.Request, ps httprou
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	q := r.URL.Query()
-	ns := q.Get("namespace")
-	if ns == "" {
-		ns = os.Getenv("KUBECOST_NAMESPACE")
-	}
-	pod := q.Get("pod")
-	selector := q.Get("selector")
-	container := q.Get("container")
-	since := q.Get("since")
-	if since == "" {
-		since = "24h"
-	}
+	qp := httputil.NewQueryParams(r.URL.Query())
+
+	ns := qp.Get("namespace", env.GetKubecostNamespace())
+	pod := qp.Get("pod", "")
+	selector := qp.Get("selector", "")
+	container := qp.Get("container", "")
+	since := qp.Get("since", "24h")
 
 	sinceDuration, err := time.ParseDuration(since)
 	if err != nil {
@@ -1284,8 +1302,6 @@ func (a *Accesses) AddServiceKey(w http.ResponseWriter, r *http.Request, ps http
 
 	r.ParseForm()
 
-	//p.CloudProvider.AddServiceKey(r.PostForm)
-
 	key := r.PostForm.Get("key")
 	k := []byte(key)
 	err := ioutil.WriteFile("/var/configs/key.json", k, 0644)
@@ -1300,7 +1316,7 @@ func (a *Accesses) GetHelmValues(w http.ResponseWriter, r *http.Request, ps http
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	encodedValues := os.Getenv("HELM_VALUES")
+	encodedValues := env.Get("HELM_VALUES", "")
 	if encodedValues == "" {
 		fmt.Fprintf(w, "Values reporting disabled")
 		return
@@ -1319,14 +1335,15 @@ func (a *Accesses) Status(w http.ResponseWriter, r *http.Request, _ httprouter.P
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
+	promServer := env.GetPrometheusServerEndpoint()
+
 	api := prometheusAPI.NewAPI(a.PrometheusClient)
 	result, err := api.Config(r.Context())
 	if err != nil {
-
-		fmt.Fprintf(w, "Using Prometheus at "+os.Getenv("PROMETHEUS_SERVER_ENDPOINT")+". Error: "+err.Error())
+		fmt.Fprintf(w, "Using Prometheus at "+promServer+". Error: "+err.Error())
 	} else {
 
-		fmt.Fprintf(w, "Using Prometheus at "+os.Getenv("PROMETHEUS_SERVER_ENDPOINT")+". PrometheusConfig: "+result.YAML)
+		fmt.Fprintf(w, "Using Prometheus at "+promServer+". PrometheusConfig: "+result.YAML)
 	}
 }
 
@@ -1441,12 +1458,24 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		panic(err.Error())
 	}
 
+	// Create ConfigFileManager for synchronization of shared configuration
+	confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		BucketStoreConfig: env.GetKubecostConfigBucket(),
+		LocalConfigPath:   "/",
+	})
+
 	// Create Kubernetes Cluster Cache + Watchers
-	k8sCache := clustercache.NewKubernetesClusterCache(kubeClientset)
+	var k8sCache clustercache.ClusterCache
+	if env.IsClusterCacheFileEnabled() {
+		importLocation := confManager.ConfigFileAt("/var/configs/cluster-cache.json")
+		k8sCache = clustercache.NewClusterImporter(importLocation)
+	} else {
+		k8sCache = clustercache.NewKubernetesClusterCache(kubeClientset)
+	}
 	k8sCache.Run()
 
 	cloudProviderKey := env.GetCloudProviderAPIKey()
-	cloudProvider, err := cloud.NewProvider(k8sCache, cloudProviderKey)
+	cloudProvider, err := cloud.NewProvider(k8sCache, cloudProviderKey, confManager)
 	if err != nil {
 		panic(err.Error())
 	}
@@ -1506,13 +1535,21 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		}
 	}
 
+	// ClusterInfo Provider to provide the cluster map with local and remote cluster data
+	var clusterInfoProvider clusters.ClusterInfoProvider
+	if env.IsClusterInfoFileEnabled() {
+		clusterInfoFile := confManager.ConfigFileAt("/var/configs/cluster-info.json")
+		clusterInfoProvider = NewConfiguredClusterInfoProvider(clusterInfoFile)
+	} else {
+		clusterInfoProvider = NewLocalClusterInfoProvider(kubeClientset, cloudProvider)
+	}
+
 	// Initialize ClusterMap for maintaining ClusterInfo by ClusterID
 	var clusterMap clusters.ClusterMap
-	localCIProvider := NewLocalClusterInfoProvider(kubeClientset, cloudProvider)
 	if thanosClient != nil {
-		clusterMap = clusters.NewClusterMap(thanosClient, localCIProvider, 10*time.Minute)
+		clusterMap = clusters.NewClusterMap(thanosClient, clusterInfoProvider, 10*time.Minute)
 	} else {
-		clusterMap = clusters.NewClusterMap(promCli, localCIProvider, 5*time.Minute)
+		clusterMap = clusters.NewClusterMap(promCli, clusterInfoProvider, 5*time.Minute)
 	}
 
 	// cache responses from model and aggregation for a default of 10 minutes;
@@ -1541,24 +1578,27 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		pc = promCli
 	}
 	costModel := NewCostModel(pc, cloudProvider, k8sCache, clusterMap, scrapeInterval)
-	metricsEmitter := NewCostModelMetricsEmitter(promCli, k8sCache, cloudProvider, costModel)
+	metricsEmitter := NewCostModelMetricsEmitter(promCli, k8sCache, cloudProvider, clusterInfoProvider, costModel)
 
 	a := &Accesses{
-		Router:            httprouter.New(),
-		PrometheusClient:  promCli,
-		ThanosClient:      thanosClient,
-		KubeClientSet:     kubeClientset,
-		ClusterMap:        clusterMap,
-		CloudProvider:     cloudProvider,
-		Model:             costModel,
-		MetricsEmitter:    metricsEmitter,
-		AggregateCache:    aggregateCache,
-		CostDataCache:     costDataCache,
-		ClusterCostsCache: clusterCostsCache,
-		OutOfClusterCache: outOfClusterCache,
-		SettingsCache:     settingsCache,
-		CacheExpiration:   cacheExpiration,
-		httpServices:      services.NewCostModelServices(),
+		Router:              httprouter.New(),
+		PrometheusClient:    promCli,
+		ThanosClient:        thanosClient,
+		KubeClientSet:       kubeClientset,
+		ClusterCache:        k8sCache,
+		ClusterMap:          clusterMap,
+		CloudProvider:       cloudProvider,
+		ConfigFileManager:   confManager,
+		ClusterInfoProvider: clusterInfoProvider,
+		Model:               costModel,
+		MetricsEmitter:      metricsEmitter,
+		AggregateCache:      aggregateCache,
+		CostDataCache:       costDataCache,
+		ClusterCostsCache:   clusterCostsCache,
+		OutOfClusterCache:   outOfClusterCache,
+		SettingsCache:       settingsCache,
+		CacheExpiration:     cacheExpiration,
+		httpServices:        services.NewCostModelServices(),
 	}
 	// Use the Accesses instance, itself, as the CostModelAggregator. This is
 	// confusing and unconventional, but necessary so that we can swap it

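One caveat on the cache-backed endpoints above: wrapAsObjectItems reproduces only the items field, so the envelope differs from a true API-server List response (the commented shapes below assume the standard k8s list schema):

	pods := a.ClusterCache.GetAllPods()
	body, err := json.Marshal(wrapAsObjectItems(pods))
	if err == nil {
		// emulated:  {"items":[...]}
		// real List: {"kind":"PodList","apiVersion":"v1","metadata":{...},"items":[...]}
		w.Write(body)
	}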
+ 45 - 1
pkg/env/costmodelenv.go

@@ -6,6 +6,7 @@ import (
 	"time"
 
 	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/util/timeutil"
 )
 
 const (
@@ -75,8 +76,51 @@ const (
 
 	PricingConfigmapName  = "PRICING_CONFIGMAP_NAME"
 	KubecostJobNameEnvVar = "KUBECOST_JOB_NAME"
+
+	KubecostConfigBucketEnvVar    = "KUBECOST_CONFIG_BUCKET"
+	ClusterInfoFileEnabledEnvVar  = "CLUSTER_INFO_FILE_ENABLED"
+	ClusterCacheFileEnabledEnvVar = "CLUSTER_CACHE_FILE_ENABLED"
+	PrometheusQueryOffsetEnvVar   = "PROMETHEUS_QUERY_OFFSET"
 )
 
+// GetKubecostConfigBucket returns a file location for a mounted bucket configuration which is used to store
+// a subset of kubecost configurations that require sharing via remote storage.
+func GetKubecostConfigBucket() string {
+	return Get(KubecostConfigBucketEnvVar, "")
+}
+
+// IsClusterInfoFileEnabled returns true if the cluster info should be read from a file rather than
+// pulled from the local cloud provider and kubernetes.
+func IsClusterInfoFileEnabled() bool {
+	return GetBool(ClusterInfoFileEnabledEnvVar, false)
+}
+
+// IsClusterCacheFileEnabled returns true if the kubernetes cluster data should be read from a file
+// rather than pulled from the local kubernetes API.
+func IsClusterCacheFileEnabled() bool {
+	return GetBool(ClusterCacheFileEnabledEnvVar, false)
+}
+
+// GetPrometheusQueryOffset returns the time.Duration to offset all prometheus queries by. NOTE: This env var is applied
+// to all non-range queries made via our query context. It should only be set when there is a significant delay in
+// data arriving in the target prom db. For example, if supplying a thanos or cortex querier for the prometheus server,
+// a 3h offset shifts the effective "current time" of every query back by 3 hours.
+//
+// This offset is NOT the same as the GetThanosOffset() option, as that is only applied to queries specifically targeting
+// thanos. This offset is applied globally.
+func GetPrometheusQueryOffset() time.Duration {
+	offset := Get(PrometheusQueryOffsetEnvVar, "")
+	if offset == "" {
+		return 0
+	}
+
+	dur, err := timeutil.ParseDuration(offset)
+	if err != nil {
+		return 0
+	}
+	return dur
+}
+
 func GetPricingConfigmapName() string {
 	return Get(PricingConfigmapName, "pricing-configs")
 }
@@ -84,7 +128,7 @@ func GetPricingConfigmapName() string {
 // GetAWSAccessKeyID returns the environment variable value for AWSAccessKeyIDEnvVar which represents
 // the AWS access key for authentication
 func GetAppVersion() string {
-	return Get(AppVersionEnvVar, "1.88.0")
+	return Get(AppVersionEnvVar, "1.89.0-rc.0")
 }
 
 // IsEmitNamespaceAnnotationsMetric returns true if cost-model is configured to emit the kube_namespace_annotations metric

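How the new knobs read at runtime (the env var values here are illustrative):

	// With KUBECOST_CONFIG_BUCKET=/var/configs/bucket.yaml and
	// PROMETHEUS_QUERY_OFFSET=3h set on the pod:
	bucketConf := env.GetKubecostConfigBucket() // "/var/configs/bucket.yaml"
	offset := env.GetPrometheusQueryOffset()    // 3h as a time.Duration
	queryTime := time.Now().Add(-offset)        // shifted "now" for non-range queries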
+ 7 - 0
pkg/env/kubemetricsenv.go

@@ -3,6 +3,7 @@ package env
 const (
 	KubecostMetricsPodEnabledEnvVar = "KUBECOST_METRICS_POD_ENABLED"
 	KubecostMetricsPodPortEnvVar    = "KUBECOST_METRICS_PORT"
+	ExportClusterCacheEnabledEnvVar = "EXPORT_CLUSTER_CACHE_ENABLED"
 )
 
 func GetKubecostMetricsPort() int {
@@ -13,3 +14,9 @@ func GetKubecostMetricsPort() int {
 func IsKubecostMetricsPodEnabled() bool {
 	return GetBool(KubecostMetricsPodEnabledEnvVar, false)
 }
+
+// IsExportClusterCacheEnabled returns true if the metrics pod should export the cluster cache
+// data to a target file location
+func IsExportClusterCacheEnabled() bool {
+	return GetBool(ExportClusterCacheEnabledEnvVar, false)
+}

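This flag is the producer half of the file-backed cache; the consumer half is CLUSTER_CACHE_FILE_ENABLED, wired in router.go above. A condensed sketch of the consumer side:

	var k8sCache clustercache.ClusterCache
	if env.IsClusterCacheFileEnabled() {
		// read-only cache hydrated from the exported cluster-cache.json
		importLocation := confManager.ConfigFileAt("/var/configs/cluster-cache.json")
		k8sCache = clustercache.NewClusterImporter(importLocation)
	} else {
		k8sCache = clustercache.NewKubernetesClusterCache(kubeClientset)
	}
	k8sCache.Run()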
+ 135 - 208
pkg/kubecost/allocation.go

@@ -350,46 +350,82 @@ func (a *Allocation) Equal(that *Allocation) bool {
 
 // TotalCost is the total cost of the Allocation including adjustments
 func (a *Allocation) TotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.CPUTotalCost() + a.GPUTotalCost() + a.RAMTotalCost() + a.PVTotalCost() + a.NetworkTotalCost() + a.LBTotalCost() + a.SharedTotalCost() + a.ExternalCost
 }
 
 // CPUTotalCost calculates total CPU cost of Allocation including adjustment
 func (a *Allocation) CPUTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.CPUCost + a.CPUCostAdjustment
 }
 
 // GPUTotalCost calculates total GPU cost of Allocation including adjustment
 func (a *Allocation) GPUTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.GPUCost + a.GPUCostAdjustment
 }
 
 // RAMTotalCost calculates total RAM cost of Allocation including adjustment
 func (a *Allocation) RAMTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.RAMCost + a.RAMCostAdjustment
 }
 
 // PVTotalCost calculates total PV cost of Allocation including adjustment
 func (a *Allocation) PVTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.PVCost() + a.PVCostAdjustment
 }
 
 // NetworkTotalCost calculates total Network cost of Allocation including adjustment
 func (a *Allocation) NetworkTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.NetworkCost + a.NetworkCostAdjustment
 }
 
 // LBTotalCost calculates total LB cost of Allocation including adjustment
 func (a *Allocation) LBTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.LoadBalancerCost + a.LoadBalancerCostAdjustment
 }
 
 // SharedTotalCost calculates total shared cost of Allocation including adjustment
 func (a *Allocation) SharedTotalCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.SharedCost
 }
 
 // PVCost calculate cumulative cost of all PVs that Allocation is attached to
 func (a *Allocation) PVCost() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	cost := 0.0
 	for _, pv := range a.PVs {
 		cost += pv.Cost
@@ -399,6 +435,10 @@ func (a *Allocation) PVCost() float64 {
 
 // PVByteHours calculate cumulative ByteHours of all PVs that Allocation is attached to
 func (a *Allocation) PVByteHours() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	byteHours := 0.0
 	for _, pv := range a.PVs {
 		byteHours += pv.ByteHours
@@ -410,6 +450,10 @@ func (a *Allocation) PVByteHours() float64 {
 // no usage or cost, then efficiency is zero. If there is no request, but there
 // is usage or cost, then efficiency is 100%.
 func (a *Allocation) CPUEfficiency() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	if a.CPUCoreRequestAverage > 0 {
 		return a.CPUCoreUsageAverage / a.CPUCoreRequestAverage
 	}
@@ -425,6 +469,10 @@ func (a *Allocation) CPUEfficiency() float64 {
 // no usage or cost, then efficiency is zero. If there is no request, but there
 // is usage or cost, then efficiency is 100%.
 func (a *Allocation) RAMEfficiency() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	if a.RAMBytesRequestAverage > 0 {
 		return a.RAMBytesUsageAverage / a.RAMBytesRequestAverage
 	}
@@ -439,6 +487,10 @@ func (a *Allocation) RAMEfficiency() float64 {
 // TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
 // there is no cost at all, then efficiency is zero.
 func (a *Allocation) TotalEfficiency() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	if a.RAMTotalCost()+a.CPUTotalCost() > 0 {
 		ramCostEff := a.RAMEfficiency() * a.RAMTotalCost()
 		cpuCostEff := a.CPUEfficiency() * a.CPUTotalCost()
@@ -482,6 +534,10 @@ func (a *Allocation) PVBytes() float64 {
 
 // ResetAdjustments sets all cost adjustment fields to zero
 func (a *Allocation) ResetAdjustments() {
+	if a == nil {
+		return
+	}
+
 	a.CPUCostAdjustment = 0.0
 	a.GPUCostAdjustment = 0.0
 	a.RAMCostAdjustment = 0.0
@@ -550,27 +606,47 @@ func (a *Allocation) IsAggregated() bool {
 
 // IsExternal is true if the given Allocation represents external costs.
 func (a *Allocation) IsExternal() bool {
+	if a == nil {
+		return false
+	}
+
 	return strings.Contains(a.Name, ExternalSuffix)
 }
 
 // IsIdle is true if the given Allocation represents idle costs.
 func (a *Allocation) IsIdle() bool {
+	if a == nil {
+		return false
+	}
+
 	return strings.Contains(a.Name, IdleSuffix)
 }
 
 // IsUnallocated is true if the given Allocation represents unallocated costs.
 func (a *Allocation) IsUnallocated() bool {
+	if a == nil {
+		return false
+	}
+
 	return strings.Contains(a.Name, UnallocatedSuffix)
 }
 
 // IsUnmounted is true if the given Allocation represents unmounted volume costs.
 func (a *Allocation) IsUnmounted() bool {
+	if a == nil {
+		return false
+	}
+
 	return strings.Contains(a.Name, UnmountedSuffix)
 }
 
 // Minutes returns the number of minutes the Allocation represents, as defined
 // by the difference between the end and start times.
 func (a *Allocation) Minutes() float64 {
+	if a == nil {
+		return 0.0
+	}
+
 	return a.End.Sub(a.Start).Minutes()
 }
 
@@ -594,6 +670,10 @@ func (a *Allocation) Share(that *Allocation) (*Allocation, error) {
 
 // String represents the given Allocation as a string
 func (a *Allocation) String() string {
+	if a == nil {
+		return "<nil>"
+	}
+
 	return fmt.Sprintf("%s%s=%.2f", a.Name, NewWindow(&a.Start, &a.End), a.TotalCost())
 }
 
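The practical effect of the nil guards added throughout this file: accessors can now be called on a possibly-nil *Allocation without panicking (illustrative):

	var a *kubecost.Allocation // nil, e.g. a missed map lookup
	total := a.TotalCost()     // 0.0 rather than a nil-pointer panic
	idle := a.IsIdle()         // false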
@@ -743,15 +823,16 @@ func NewAllocationSet(start, end time.Time, allocs ...*Allocation) *AllocationSe
 // succeeds, the allocation is marked as a shared resource. ShareIdle is a
 // simple flag for sharing idle resources.
 type AllocationAggregationOptions struct {
-	FilterFuncs       []AllocationMatchFunc
-	IdleByNode        bool
-	LabelConfig       *LabelConfig
-	MergeUnallocated  bool
-	SharedHourlyCosts map[string]float64
-	ShareFuncs        []AllocationMatchFunc
-	ShareIdle         string
-	ShareSplit        string
-	SplitIdle         bool
+	AllocationTotalsStore AllocationTotalsStore
+	FilterFuncs           []AllocationMatchFunc
+	IdleByNode            bool
+	LabelConfig           *LabelConfig
+	MergeUnallocated      bool
+	ShareFuncs            []AllocationMatchFunc
+	ShareIdle             string
+	ShareSplit            string
+	SharedHourlyCosts     map[string]float64
+	SplitIdle             bool
 }
 
 // AggregateBy aggregates the Allocations in the given AllocationSet by the given
@@ -759,33 +840,47 @@ type AllocationAggregationOptions struct {
 // given AllocationProperty; e.g. Containers can be divided by Namespace, but not vice-a-versa.
 func (as *AllocationSet) AggregateBy(aggregateBy []string, options *AllocationAggregationOptions) error {
 	// The order of operations for aggregating allocations is as follows:
+	//
 	//  1. Partition external, idle, and shared allocations into separate sets.
 	//     Also, create the aggSet into which the results will be aggregated.
+	//
 	//  2. Compute sharing coefficients for idle and shared resources
 	//     a) if idle allocation is to be shared, compute idle coefficients
 	//     b) if idle allocation is NOT shared, but filters are present, compute
 	//        idle filtration coefficients for the purpose of only returning the
 	//        portion of idle allocation that would have been shared with the
 	//        unfiltered results. (See unit tests 5.a,b,c)
-	//     c) generate shared allocation for then given shared overhead, which
+	//     c) generate shared allocation for the given shared overhead, which
 	//        must happen after (2a) and (2b)
 	//     d) if there are shared resources, compute share coefficients
+	//
 	//  3. Drop any allocation that fails any of the filters
+	//
 	//  4. Distribute idle allocations according to the idle coefficients
+	//
 	//  5. Generate aggregation key and insert allocation into the output set
+	//
 	//  6. If idle is shared and resources are shared, some idle might be shared
 	//     with a shared resource. Distribute that to the shared resources
 	//     prior to sharing them with the aggregated results.
+	//
 	//  7. Apply idle filtration coefficients from step (2b)
+	//
 	//  8. Distribute shared allocations according to the share coefficients.
+	//
 	//  9. If there are external allocations that can be aggregated into
 	//     the output (i.e. they can be used to generate a valid key for
 	//     the given properties) then aggregate; otherwise... ignore them?
+	//
 	// 10. If the merge idle option is enabled, merge any remaining idle
 	//     allocations into a single idle allocation. If there was any idle
 	//	   whose costs were not distributed because there was no usage of a
 	//     specific resource type, re-add the idle to the aggregation with
 	//     only that type.
+	//
+	// 11. Distribute any undistributed idle, in the case that idle
+	//     coefficients end up being zero and some idle is not shared.
+
 	if as.IsEmpty() {
 		return nil
 	}
@@ -963,7 +1058,7 @@ func (as *AllocationSet) AggregateBy(aggregateBy []string, options *AllocationAg
 	}
 
 	// (2c) Convert SharedHourlyCosts to Allocations in the shareSet. This must
-	// come after idle coefficients are computes so that allocations generated
+	// come after idle coefficients are computed so that allocations generated
 	// by shared overhead do not skew the idle coefficient computation.
 	for name, cost := range options.SharedHourlyCosts {
 		if cost > 0.0 {
@@ -1178,7 +1273,7 @@ func (as *AllocationSet) AggregateBy(aggregateBy []string, options *AllocationAg
 		for _, alloc := range aggSet.allocations {
 			for _, sharedAlloc := range shareSet.allocations {
 				if _, ok := shareCoefficients[alloc.Name]; !ok {
-					if !alloc.IsIdle() {
+					if !alloc.IsIdle() && !alloc.IsUnmounted() {
 						log.Warningf("AllocationSet.AggregateBy: error getting share coefficienct for '%s'", alloc.Name)
 					}
 					continue
@@ -1218,7 +1313,7 @@ func (as *AllocationSet) AggregateBy(aggregateBy []string, options *AllocationAg
 		}
 	}
 
-	// In the edge case that some idle has not been distributed because
+	// (11) In the edge case that some idle has not been distributed because
 	// there is no usage of that resource type, add idle back to
 	// aggregations with only that cost applied.
 
@@ -1238,42 +1333,36 @@ func (as *AllocationSet) AggregateBy(aggregateBy []string, options *AllocationAg
 	// __idle__ $0      $12     $0
 	// kubecost $12     $0      $7
 
-	if idleSet.Length() > 0 && !options.SplitIdle {
-		if undistributedIdleMap["cpu"] || undistributedIdleMap["gpu"] || undistributedIdleMap["ram"] {
-
-			for _, idleAlloc := range idleSet.allocations {
-
-				skip := false
-
-				// if the idle does not apply to the non-filtered values, skip it
-				for _, ff := range options.FilterFuncs {
-					if !ff(idleAlloc) {
-						skip = true
-						break
-					}
+	hasUndistributedIdle := undistributedIdleMap["cpu"] || undistributedIdleMap["gpu"] || undistributedIdleMap["ram"]
+	if idleSet.Length() > 0 && hasUndistributedIdle {
+		for _, idleAlloc := range idleSet.allocations {
+			// if the idle does not apply to the non-filtered values, skip it
+			skip := false
+			for _, ff := range options.FilterFuncs {
+				if !ff(idleAlloc) {
+					skip = true
+					break
 				}
+			}
+			if skip {
+				continue
+			}
 
-				if skip {
-					continue
+			// if the idle doesn't carry a cost in every resource type, skip it
+			if idleAlloc.CPUCost != 0 && idleAlloc.GPUCost != 0 && idleAlloc.RAMCost != 0 {
+				// artificially set the already shared costs to zero
+				if !undistributedIdleMap["cpu"] {
+					idleAlloc.CPUCost = 0
 				}
-
-				// if the idle doesn't have a cost to be shared, also skip it
-				if idleAlloc.CPUCost != 0 && idleAlloc.GPUCost != 0 && idleAlloc.RAMCost != 0 {
-
-					// artificially set the already shared costs to zero
-					if !undistributedIdleMap["cpu"] {
-						idleAlloc.CPUCost = 0
-					}
-					if !undistributedIdleMap["gpu"] {
-						idleAlloc.GPUCost = 0
-					}
-					if !undistributedIdleMap["ram"] {
-						idleAlloc.RAMCost = 0
-					}
-
-					idleAlloc.Name = IdleSuffix
-					aggSet.Insert(idleAlloc)
+				if !undistributedIdleMap["gpu"] {
+					idleAlloc.GPUCost = 0
 				}
+				if !undistributedIdleMap["ram"] {
+					idleAlloc.RAMCost = 0
+				}
+
+				idleAlloc.Name = IdleSuffix
+				aggSet.Insert(idleAlloc)
 			}
 		}
 	}
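To make the re-add logic above concrete, here is a minimal standalone sketch (hypothetical values, not the package API) of zeroing the already-distributed resource costs before re-inserting the remainder:

```go
package main

import "fmt"

func main() {
	// Hypothetical per-resource costs for one idle allocation.
	idleCost := map[string]float64{"cpu": 3.00, "gpu": 4.20, "ram": 1.50}

	// true means the resource saw no usage anywhere, so its idle cost
	// could not be distributed and must be re-added to the aggregation.
	undistributed := map[string]bool{"cpu": false, "gpu": true, "ram": false}

	for resource := range idleCost {
		if !undistributed[resource] {
			// Artificially zero the already-distributed cost, mirroring
			// the branch above.
			idleCost[resource] = 0.0
		}
	}

	fmt.Println(idleCost) // map[cpu:0 gpu:4.2 ram:0]; only the GPU idle cost is re-added
}
```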
@@ -1504,169 +1593,7 @@ func (a *Allocation) generateKey(aggregateBy []string, labelConfig *LabelConfig)
 		return ""
 	}
 
-	if labelConfig == nil {
-		labelConfig = NewLabelConfig()
-	}
-
-	// Names will ultimately be joined into a single name, which uniquely
-	// identifies allocations.
-	names := []string{}
-
-	for _, agg := range aggregateBy {
-		switch true {
-		case agg == AllocationClusterProp:
-			names = append(names, a.Properties.Cluster)
-		case agg == AllocationNodeProp:
-			names = append(names, a.Properties.Node)
-		case agg == AllocationNamespaceProp:
-			names = append(names, a.Properties.Namespace)
-		case agg == AllocationControllerKindProp:
-			controllerKind := a.Properties.ControllerKind
-			if controllerKind == "" {
-				// Indicate that allocation has no controller
-				controllerKind = UnallocatedSuffix
-			}
-			names = append(names, controllerKind)
-		case agg == AllocationDaemonSetProp || agg == AllocationStatefulSetProp || agg == AllocationDeploymentProp || agg == AllocationJobProp:
-			controller := a.Properties.Controller
-			if agg != a.Properties.ControllerKind || controller == "" {
-				// The allocation does not have the specified controller kind
-				controller = UnallocatedSuffix
-			}
-			names = append(names, controller)
-		case agg == AllocationControllerProp:
-			controller := a.Properties.Controller
-			if controller == "" {
-				// Indicate that allocation has no controller
-				controller = UnallocatedSuffix
-			} else if a.Properties.ControllerKind != "" {
-				controller = fmt.Sprintf("%s:%s", a.Properties.ControllerKind, controller)
-			}
-			names = append(names, controller)
-		case agg == AllocationPodProp:
-			names = append(names, a.Properties.Pod)
-		case agg == AllocationContainerProp:
-			names = append(names, a.Properties.Container)
-		case agg == AllocationServiceProp:
-			services := a.Properties.Services
-			if len(services) == 0 {
-				// Indicate that allocation has no services
-				names = append(names, UnallocatedSuffix)
-			} else {
-				// This just uses the first service
-				for _, service := range services {
-					names = append(names, service)
-					break
-				}
-			}
-		case strings.HasPrefix(agg, "label:"):
-			labels := a.Properties.Labels
-			if labels == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				labelName := labelConfig.Sanitize(strings.TrimPrefix(agg, "label:"))
-				if labelValue, ok := labels[labelName]; ok {
-					names = append(names, fmt.Sprintf("%s=%s", labelName, labelValue))
-				} else {
-					names = append(names, UnallocatedSuffix)
-				}
-			}
-		case strings.HasPrefix(agg, "annotation:"):
-			annotations := a.Properties.Annotations
-			if annotations == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				annotationName := labelConfig.Sanitize(strings.TrimPrefix(agg, "annotation:"))
-				if annotationValue, ok := annotations[annotationName]; ok {
-					names = append(names, fmt.Sprintf("%s=%s", annotationName, annotationValue))
-				} else {
-					names = append(names, UnallocatedSuffix)
-				}
-			}
-		case agg == AllocationDepartmentProp:
-			labels := a.Properties.Labels
-			if labels == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				labelNames := strings.Split(labelConfig.DepartmentLabel, ",")
-				for _, labelName := range labelNames {
-					labelName = labelConfig.Sanitize(labelName)
-					if labelValue, ok := labels[labelName]; ok {
-						names = append(names, labelValue)
-					} else {
-						names = append(names, UnallocatedSuffix)
-					}
-				}
-			}
-		case agg == AllocationEnvironmentProp:
-			labels := a.Properties.Labels
-			if labels == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				labelNames := strings.Split(labelConfig.EnvironmentLabel, ",")
-				for _, labelName := range labelNames {
-					labelName = labelConfig.Sanitize(labelName)
-					if labelValue, ok := labels[labelName]; ok {
-						names = append(names, labelValue)
-					} else {
-						names = append(names, UnallocatedSuffix)
-					}
-				}
-			}
-		case agg == AllocationOwnerProp:
-			labels := a.Properties.Labels
-			if labels == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				labelNames := strings.Split(labelConfig.OwnerLabel, ",")
-				for _, labelName := range labelNames {
-					labelName = labelConfig.Sanitize(labelName)
-					if labelValue, ok := labels[labelName]; ok {
-						names = append(names, labelValue)
-					} else {
-						names = append(names, UnallocatedSuffix)
-					}
-				}
-			}
-		case agg == AllocationProductProp:
-			labels := a.Properties.Labels
-			if labels == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				labelNames := strings.Split(labelConfig.ProductLabel, ",")
-				for _, labelName := range labelNames {
-					labelName = labelConfig.Sanitize(labelName)
-					if labelValue, ok := labels[labelName]; ok {
-						names = append(names, labelValue)
-					} else {
-						names = append(names, UnallocatedSuffix)
-					}
-				}
-			}
-		case agg == AllocationTeamProp:
-			labels := a.Properties.Labels
-			if labels == nil {
-				names = append(names, UnallocatedSuffix)
-			} else {
-				labelNames := strings.Split(labelConfig.TeamLabel, ",")
-				for _, labelName := range labelNames {
-					labelName = labelConfig.Sanitize(labelName)
-					if labelValue, ok := labels[labelName]; ok {
-						names = append(names, labelValue)
-					} else {
-						names = append(names, UnallocatedSuffix)
-					}
-				}
-			}
-		default:
-			// This case should never be reached, as input up until this point
-			// should be checked and rejected if invalid. But if we do get a
-			// value we don't recognize, log a warning.
-			log.Warningf("AggregateBy: illegal aggregation parameter: %s", agg)
-		}
-	}
-
-	return strings.Join(names, "/")
+	return a.Properties.GenerateKey(aggregateBy, labelConfig)
 }
 
 // Clone returns a new AllocationSet with a deep copy of the given

+ 179 - 7
pkg/kubecost/allocationprops.go

@@ -5,6 +5,7 @@ import (
 	"sort"
 	"strings"
 
+	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/prom"
 )
 
@@ -128,9 +129,7 @@ func (p *AllocationProperties) Clone() *AllocationProperties {
 	clone.ProviderID = p.ProviderID
 
 	var services []string
-	for _, s := range p.Services {
-		services = append(services, s)
-	}
+	services = append(services, p.Services...)
 	clone.Services = services
 
 	labels := make(map[string]string, len(p.Labels))
@@ -229,6 +228,179 @@ func (p *AllocationProperties) Equal(that *AllocationProperties) bool {
 	return true
 }
 
+// GenerateKey generates a string that represents the key by which the
+// AllocationProperties should be aggregated, given the properties defined by
+// the aggregateBy parameter and the given label configuration.
+func (p *AllocationProperties) GenerateKey(aggregateBy []string, labelConfig *LabelConfig) string {
+	if p == nil {
+		return ""
+	}
+
+	if labelConfig == nil {
+		labelConfig = NewLabelConfig()
+	}
+
+	// Names will ultimately be joined into a single name, which uniquely
+	// identifies allocations.
+	names := []string{}
+
+	for _, agg := range aggregateBy {
+		switch true {
+		case agg == AllocationClusterProp:
+			names = append(names, p.Cluster)
+		case agg == AllocationNodeProp:
+			names = append(names, p.Node)
+		case agg == AllocationNamespaceProp:
+			names = append(names, p.Namespace)
+		case agg == AllocationControllerKindProp:
+			controllerKind := p.ControllerKind
+			if controllerKind == "" {
+				// Indicate that allocation has no controller
+				controllerKind = UnallocatedSuffix
+			}
+			names = append(names, controllerKind)
+		case agg == AllocationDaemonSetProp || agg == AllocationStatefulSetProp || agg == AllocationDeploymentProp || agg == AllocationJobProp:
+			controller := p.Controller
+			if agg != p.ControllerKind || controller == "" {
+				// The allocation does not have the specified controller kind
+				controller = UnallocatedSuffix
+			}
+			names = append(names, controller)
+		case agg == AllocationControllerProp:
+			controller := p.Controller
+			if controller == "" {
+				// Indicate that allocation has no controller
+				controller = UnallocatedSuffix
+			} else if p.ControllerKind != "" {
+				controller = fmt.Sprintf("%s:%s", p.ControllerKind, controller)
+			}
+			names = append(names, controller)
+		case agg == AllocationPodProp:
+			names = append(names, p.Pod)
+		case agg == AllocationContainerProp:
+			names = append(names, p.Container)
+		case agg == AllocationServiceProp:
+			services := p.Services
+			if len(services) == 0 {
+				// Indicate that allocation has no services
+				names = append(names, UnallocatedSuffix)
+			} else {
+				// This just uses the first service
+				for _, service := range services {
+					names = append(names, service)
+					break
+				}
+			}
+		case strings.HasPrefix(agg, "label:"):
+			labels := p.Labels
+			if labels == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				labelName := labelConfig.Sanitize(strings.TrimPrefix(agg, "label:"))
+				if labelValue, ok := labels[labelName]; ok {
+					names = append(names, fmt.Sprintf("%s=%s", labelName, labelValue))
+				} else {
+					names = append(names, UnallocatedSuffix)
+				}
+			}
+		case strings.HasPrefix(agg, "annotation:"):
+			annotations := p.Annotations
+			if annotations == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				annotationName := labelConfig.Sanitize(strings.TrimPrefix(agg, "annotation:"))
+				if annotationValue, ok := annotations[annotationName]; ok {
+					names = append(names, fmt.Sprintf("%s=%s", annotationName, annotationValue))
+				} else {
+					names = append(names, UnallocatedSuffix)
+				}
+			}
+		case agg == AllocationDepartmentProp:
+			labels := p.Labels
+			if labels == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				labelNames := strings.Split(labelConfig.DepartmentLabel, ",")
+				for _, labelName := range labelNames {
+					labelName = labelConfig.Sanitize(labelName)
+					if labelValue, ok := labels[labelName]; ok {
+						names = append(names, labelValue)
+					} else {
+						names = append(names, UnallocatedSuffix)
+					}
+				}
+			}
+		case agg == AllocationEnvironmentProp:
+			labels := p.Labels
+			if labels == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				labelNames := strings.Split(labelConfig.EnvironmentLabel, ",")
+				for _, labelName := range labelNames {
+					labelName = labelConfig.Sanitize(labelName)
+					if labelValue, ok := labels[labelName]; ok {
+						names = append(names, labelValue)
+					} else {
+						names = append(names, UnallocatedSuffix)
+					}
+				}
+			}
+		case agg == AllocationOwnerProp:
+			labels := p.Labels
+			if labels == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				labelNames := strings.Split(labelConfig.OwnerLabel, ",")
+				for _, labelName := range labelNames {
+					labelName = labelConfig.Sanitize(labelName)
+					if labelValue, ok := labels[labelName]; ok {
+						names = append(names, labelValue)
+					} else {
+						names = append(names, UnallocatedSuffix)
+					}
+				}
+			}
+		case agg == AllocationProductProp:
+			labels := p.Labels
+			if labels == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				labelNames := strings.Split(labelConfig.ProductLabel, ",")
+				for _, labelName := range labelNames {
+					labelName = labelConfig.Sanitize(labelName)
+					if labelValue, ok := labels[labelName]; ok {
+						names = append(names, labelValue)
+					} else {
+						names = append(names, UnallocatedSuffix)
+					}
+				}
+			}
+		case agg == AllocationTeamProp:
+			labels := p.Labels
+			if labels == nil {
+				names = append(names, UnallocatedSuffix)
+			} else {
+				labelNames := strings.Split(labelConfig.TeamLabel, ",")
+				for _, labelName := range labelNames {
+					labelName = labelConfig.Sanitize(labelName)
+					if labelValue, ok := labels[labelName]; ok {
+						names = append(names, labelValue)
+					} else {
+						names = append(names, UnallocatedSuffix)
+					}
+				}
+			}
+		default:
+			// This case should never be reached, as input up until this point
+			// should be checked and rejected if invalid. But if we do get a
+			// value we don't recognize, log a warning.
+			log.Warningf("generateKey: illegal aggregation parameter: %s", agg)
+		}
+	}
+
+	return strings.Join(names, "/")
+}
+
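As a usage sketch of the extracted method (assuming the exported property constants shown above and the default label configuration), aggregating by namespace and an `app` label yields a `/`-joined key:

```go
package main

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/kubecost"
)

func main() {
	props := &kubecost.AllocationProperties{
		Cluster:   "cluster1",
		Namespace: "kubecost",
		Labels:    map[string]string{"app": "cost-analyzer"},
	}

	// A nil LabelConfig falls back to NewLabelConfig() internally.
	key := props.GenerateKey([]string{kubecost.AllocationNamespaceProp, "label:app"}, nil)

	// Expected: kubecost/app=cost-analyzer (assuming sanitization leaves "app" as-is)
	fmt.Println(key)
}
```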
 // Intersection returns an *AllocationProperties which contains all matching fields between the calling and parameter AllocationProperties
 // nillable slices and maps are left as nil
 func (p *AllocationProperties) Intersection(that *AllocationProperties) *AllocationProperties {
@@ -310,13 +482,13 @@ func (p *AllocationProperties) String() string {
 	for k, prop := range p.Labels {
 		labelStrs = append(labelStrs, fmt.Sprintf("%s:%s", k, prop))
 	}
-	strs = append(strs, fmt.Sprintf("Labels:{%s}", strings.Join(strs, ",")))
+	strs = append(strs, fmt.Sprintf("Labels:{%s}", strings.Join(labelStrs, ",")))
 
-	var AnnotationStrs []string
+	var annotationStrs []string
 	for k, prop := range p.Annotations {
-		AnnotationStrs = append(AnnotationStrs, fmt.Sprintf("%s:%s", k, prop))
+		annotationStrs = append(annotationStrs, fmt.Sprintf("%s:%s", k, prop))
 	}
-	strs = append(strs, fmt.Sprintf("Annotations:{%s}", strings.Join(strs, ",")))
+	strs = append(strs, fmt.Sprintf("Annotations:{%s}", strings.Join(annotationStrs, ",")))
 
 	return fmt.Sprintf("{%s}", strings.Join(strs, "; "))
 }

+ 3 - 0
pkg/kubecost/assetprops.go

@@ -24,6 +24,9 @@ const (
 	// AssetNameProp describes the name of the Asset
 	AssetNameProp AssetProperty = "name"
 
+	// AssetNodeProp describes the node of the Asset
+	AssetNodeProp AssetProperty = "node"
+
 	// AssetProjectProp describes the project of the Asset
 	AssetProjectProp AssetProperty = "project"
 

+ 1256 - 0
pkg/kubecost/summaryallocation.go

@@ -0,0 +1,1256 @@
+package kubecost
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/log"
+)
+
+// SummaryAllocation summarizes an Allocation, keeping only the fields
+// necessary for a high-level view: identifying the Allocation, the period
+// of time (Start, End) over which it ran, and the associated per-resource
+// costs (subtotaled with adjustments), total cost, and efficiency.
+//
+// SummaryAllocation does not have a concept of Window (i.e. the time period
+// within which it is defined, as opposed to the Start and End times). That
+// context must be provided by a SummaryAllocationSet.
+type SummaryAllocation struct {
+	Name                   string                `json:"name"`
+	Properties             *AllocationProperties `json:"-"`
+	Start                  time.Time             `json:"start"`
+	End                    time.Time             `json:"end"`
+	CPUCoreRequestAverage  float64               `json:"cpuCoreRequestAverage"`
+	CPUCoreUsageAverage    float64               `json:"cpuCoreUsageAverage"`
+	CPUCost                float64               `json:"cpuCost"`
+	GPUCost                float64               `json:"gpuCost"`
+	NetworkCost            float64               `json:"networkCost"`
+	LoadBalancerCost       float64               `json:"loadBalancerCost"`
+	PVCost                 float64               `json:"pvCost"`
+	RAMBytesRequestAverage float64               `json:"ramByteRequestAverage"`
+	RAMBytesUsageAverage   float64               `json:"ramByteUsageAverage"`
+	RAMCost                float64               `json:"ramCost"`
+	SharedCost             float64               `json:"sharedCost"`
+	ExternalCost           float64               `json:"externalCost"`
+	Share                  bool                  `json:"-"`
+}
+
+// NewSummaryAllocation converts an Allocation to a SummaryAllocation by
+// dropping unnecessary fields and consolidating others (e.g. adjustments).
+// Reconciliation happens here because that process is synonymous with the
+// consolidation of adjustment fields.
+func NewSummaryAllocation(alloc *Allocation, reconcile, reconcileNetwork bool) *SummaryAllocation {
+	if alloc == nil {
+		return nil
+	}
+
+	sa := &SummaryAllocation{
+		Name:                   alloc.Name,
+		Properties:             alloc.Properties.Clone(),
+		Start:                  alloc.Start,
+		End:                    alloc.End,
+		CPUCoreRequestAverage:  alloc.CPUCoreRequestAverage,
+		CPUCoreUsageAverage:    alloc.CPUCoreUsageAverage,
+		CPUCost:                alloc.CPUCost + alloc.CPUCostAdjustment,
+		GPUCost:                alloc.GPUCost + alloc.GPUCostAdjustment,
+		NetworkCost:            alloc.NetworkCost + alloc.NetworkCostAdjustment,
+		LoadBalancerCost:       alloc.LoadBalancerCost + alloc.LoadBalancerCostAdjustment,
+		PVCost:                 alloc.PVCost() + alloc.PVCostAdjustment,
+		RAMBytesRequestAverage: alloc.RAMBytesRequestAverage,
+		RAMBytesUsageAverage:   alloc.RAMBytesUsageAverage,
+		RAMCost:                alloc.RAMCost + alloc.RAMCostAdjustment,
+		SharedCost:             alloc.SharedCost,
+		ExternalCost:           alloc.ExternalCost,
+	}
+
+	// Revert adjustments if reconciliation is off. If only network
+	// reconciliation is off, only revert network adjustment.
+	if !reconcile {
+		sa.CPUCost -= alloc.CPUCostAdjustment
+		sa.GPUCost -= alloc.GPUCostAdjustment
+		sa.NetworkCost -= alloc.NetworkCostAdjustment
+		sa.LoadBalancerCost -= alloc.LoadBalancerCostAdjustment
+		sa.PVCost -= alloc.PVCostAdjustment
+		sa.RAMCost -= alloc.RAMCostAdjustment
+	} else if !reconcileNetwork {
+		sa.NetworkCost -= alloc.NetworkCostAdjustment
+	}
+
+	return sa
+}
+
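A small numeric illustration of the consolidate-then-revert logic above (hypothetical costs, standalone):

```go
package main

import "fmt"

func main() {
	cpuCost, cpuAdj := 1.00, 0.25 // raw CPU cost plus reconciliation adjustment
	netCost, netAdj := 0.50, 0.25 // raw network cost plus adjustment

	reconcile, reconcileNetwork := true, false

	// Adjustments are consolidated into the summary costs up front...
	summaryCPU := cpuCost + cpuAdj
	summaryNet := netCost + netAdj

	// ...and reverted afterwards if reconciliation is disabled.
	if !reconcile {
		summaryCPU -= cpuAdj
		summaryNet -= netAdj
	} else if !reconcileNetwork {
		summaryNet -= netAdj // network-only revert
	}

	fmt.Println(summaryCPU, summaryNet) // 1.25 0.5
}
```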
+// Add sums two SummaryAllocations, adding the given SummaryAllocation to the
+// receiving one, thus mutating the receiver. For performance reasons, it
+// simply drops Properties, so a SummaryAllocation can only be Added once.
+func (sa *SummaryAllocation) Add(that *SummaryAllocation) error {
+	if sa == nil || that == nil {
+		return errors.New("cannot Add a nil SummaryAllocation")
+	}
+
+	// Once Added, a SummaryAllocation has no Properties. This saves us from
+	// having to compute the intersection of two sets of Properties, which is
+	// expensive.
+	sa.Properties = nil
+
+	// Sum non-cumulative fields by turning them into cumulative, adding them,
+	// and then converting them back into averages after minutes have been
+	// combined (just below).
+	cpuReqCoreMins := sa.CPUCoreRequestAverage * sa.Minutes()
+	cpuReqCoreMins += that.CPUCoreRequestAverage * that.Minutes()
+
+	cpuUseCoreMins := sa.CPUCoreUsageAverage * sa.Minutes()
+	cpuUseCoreMins += that.CPUCoreUsageAverage * that.Minutes()
+
+	ramReqByteMins := sa.RAMBytesRequestAverage * sa.Minutes()
+	ramReqByteMins += that.RAMBytesRequestAverage * that.Minutes()
+
+	ramUseByteMins := sa.RAMBytesUsageAverage * sa.Minutes()
+	ramUseByteMins += that.RAMBytesUsageAverage * that.Minutes()
+
+	// Expand Start and End to cover the union of both Allocations' windows
+	if that.Start.Before(sa.Start) {
+		sa.Start = that.Start
+	}
+	if that.End.After(sa.End) {
+		sa.End = that.End
+	}
+
+	// Convert cumulative request and usage back into rates
+	if sa.Minutes() > 0 {
+		sa.CPUCoreRequestAverage = cpuReqCoreMins / sa.Minutes()
+		sa.CPUCoreUsageAverage = cpuUseCoreMins / sa.Minutes()
+		sa.RAMBytesRequestAverage = ramReqByteMins / sa.Minutes()
+		sa.RAMBytesUsageAverage = ramUseByteMins / sa.Minutes()
+	} else {
+		sa.CPUCoreRequestAverage = 0.0
+		sa.CPUCoreUsageAverage = 0.0
+		sa.RAMBytesRequestAverage = 0.0
+		sa.RAMBytesUsageAverage = 0.0
+	}
+
+	// Sum all cumulative cost fields
+	sa.CPUCost += that.CPUCost
+	sa.ExternalCost += that.ExternalCost
+	sa.GPUCost += that.GPUCost
+	sa.LoadBalancerCost += that.LoadBalancerCost
+	sa.NetworkCost += that.NetworkCost
+	sa.PVCost += that.PVCost
+	sa.RAMCost += that.RAMCost
+	sa.SharedCost += that.SharedCost
+
+	return nil
+}
+
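Because the request and usage fields are time-weighted averages, they must be recombined through cumulative core-minutes, as the method above does. A standalone worked example, assuming one window contains the other:

```go
package main

import "fmt"

func main() {
	// Allocation A: 60 minutes at an average CPU request of 0.5 cores.
	// Allocation B: 30 minutes, inside A's window, at 1.0 cores.
	aMins, aAvg := 60.0, 0.5
	bMins, bAvg := 30.0, 1.0

	coreMins := aAvg*aMins + bAvg*bMins // 30 + 30 = 60 core-minutes

	// The combined window is the union of the two, i.e. still 60 minutes,
	// so the combined average is 1.0 cores, not the naive (0.5+1.0)/2.
	combinedMins := 60.0
	fmt.Println(coreMins / combinedMins) // 1
}
```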
+// Clone copies the SummaryAllocation and returns the copy
+func (sa *SummaryAllocation) Clone() *SummaryAllocation {
+	return &SummaryAllocation{
+		Name:                   sa.Name,
+		Properties:             sa.Properties.Clone(),
+		Start:                  sa.Start,
+		End:                    sa.End,
+		CPUCoreRequestAverage:  sa.CPUCoreRequestAverage,
+		CPUCoreUsageAverage:    sa.CPUCoreUsageAverage,
+		CPUCost:                sa.CPUCost,
+		GPUCost:                sa.GPUCost,
+		NetworkCost:            sa.NetworkCost,
+		LoadBalancerCost:       sa.LoadBalancerCost,
+		PVCost:                 sa.PVCost,
+		RAMBytesRequestAverage: sa.RAMBytesRequestAverage,
+		RAMBytesUsageAverage:   sa.RAMBytesUsageAverage,
+		RAMCost:                sa.RAMCost,
+		SharedCost:             sa.SharedCost,
+		ExternalCost:           sa.ExternalCost,
+	}
+}
+
+// CPUEfficiency is the ratio of usage to request. If there is no request and
+// no usage or cost, then efficiency is zero. If there is no request, but there
+// is usage or cost, then efficiency is 100%.
+func (sa *SummaryAllocation) CPUEfficiency() float64 {
+	if sa == nil {
+		return 0.0
+	}
+
+	if sa.CPUCoreRequestAverage > 0 {
+		return sa.CPUCoreUsageAverage / sa.CPUCoreRequestAverage
+	}
+
+	if sa.CPUCoreUsageAverage == 0.0 || sa.CPUCost == 0.0 {
+		return 0.0
+	}
+
+	return 1.0
+}
+
+func (sa *SummaryAllocation) generateKey(aggregateBy []string, labelConfig *LabelConfig) string {
+	if sa == nil {
+		return ""
+	}
+
+	return sa.Properties.GenerateKey(aggregateBy, labelConfig)
+}
+
+// IsExternal is true if the given SummaryAllocation represents external costs.
+func (sa *SummaryAllocation) IsExternal() bool {
+	if sa == nil {
+		return false
+	}
+
+	return strings.Contains(sa.Name, ExternalSuffix)
+}
+
+// IsIdle is true if the given SummaryAllocation represents idle costs.
+func (sa *SummaryAllocation) IsIdle() bool {
+	if sa == nil {
+		return false
+	}
+
+	return strings.Contains(sa.Name, IdleSuffix)
+}
+
+// IsUnallocated is true if the given SummaryAllocation represents unallocated
+// costs.
+func (sa *SummaryAllocation) IsUnallocated() bool {
+	if sa == nil {
+		return false
+	}
+
+	return strings.Contains(sa.Name, UnallocatedSuffix)
+}
+
+// IsUnmounted is true if the given SummaryAllocation represents unmounted
+// volume costs.
+func (sa *SummaryAllocation) IsUnmounted() bool {
+	if sa == nil {
+		return false
+	}
+
+	return strings.Contains(sa.Name, UnmountedSuffix)
+}
+
+// Minutes returns the number of minutes the SummaryAllocation represents, as
+// defined by the difference between the end and start times.
+func (sa *SummaryAllocation) Minutes() float64 {
+	if sa == nil {
+		return 0.0
+	}
+
+	return sa.End.Sub(sa.Start).Minutes()
+}
+
+// RAMEfficiency is the ratio of usage to request. If there is no request and
+// no usage or cost, then efficiency is zero. If there is no request, but there
+// is usage or cost, then efficiency is 100%.
+func (sa *SummaryAllocation) RAMEfficiency() float64 {
+	if sa == nil {
+		return 0.0
+	}
+
+	if sa.RAMBytesRequestAverage > 0 {
+		return sa.RAMBytesUsageAverage / sa.RAMBytesRequestAverage
+	}
+
+	if sa.RAMBytesUsageAverage == 0.0 || sa.RAMCost == 0.0 {
+		return 0.0
+	}
+
+	return 1.0
+}
+
+// TotalCost is the total cost of the SummaryAllocation
+func (sa *SummaryAllocation) TotalCost() float64 {
+	if sa == nil {
+		return 0.0
+	}
+
+	return sa.CPUCost + sa.GPUCost + sa.RAMCost + sa.PVCost + sa.NetworkCost + sa.LoadBalancerCost + sa.SharedCost + sa.ExternalCost
+}
+
+// TotalEfficiency is the cost-weighted average of CPU and RAM efficiency. If
+// there is no cost at all, then efficiency is zero.
+func (sa *SummaryAllocation) TotalEfficiency() float64 {
+	if sa == nil {
+		return 0.0
+	}
+
+	if sa.RAMCost+sa.CPUCost > 0 {
+		ramCostEff := sa.RAMEfficiency() * sa.RAMCost
+		cpuCostEff := sa.CPUEfficiency() * sa.CPUCost
+		return (ramCostEff + cpuCostEff) / (sa.CPUCost + sa.RAMCost)
+	}
+
+	return 0.0
+}
+
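A quick standalone check of the cost-weighted average defined above:

```go
package main

import "fmt"

func main() {
	cpuCost, cpuEff := 2.0, 0.5 // $2 of CPU at 50% efficiency
	ramCost, ramEff := 1.0, 1.0 // $1 of RAM at 100% efficiency

	total := (ramEff*ramCost + cpuEff*cpuCost) / (cpuCost + ramCost)
	fmt.Printf("%.3f\n", total) // 0.667, weighted toward the costlier resource
}
```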
+// SummaryAllocationSet stores a set of SummaryAllocations, each with a unique
+// name, that share a window. A SummaryAllocationSet is mutable, so treat it
+// like a threadsafe map.
+type SummaryAllocationSet struct {
+	sync.RWMutex
+	externalKeys       map[string]bool
+	idleKeys           map[string]bool
+	SummaryAllocations map[string]*SummaryAllocation `json:"allocations"`
+	Window             Window                        `json:"window"`
+}
+
+// NewSummaryAllocationSet converts an AllocationSet to a SummaryAllocationSet.
+// Filter functions, sharing functions, and reconciliation parameters are
+// required for unfortunate reasons to do with performance and legacy order-of-
+// operations details, as well as the fact that reconciliation has been
+// pushed down to the conversion step between Allocation and SummaryAllocation.
+func NewSummaryAllocationSet(as *AllocationSet, ffs, sfs []AllocationMatchFunc, reconcile, reconcileNetwork bool) *SummaryAllocationSet {
+	if as == nil {
+		return nil
+	}
+
+	// If we can know the exact size of the map, use it. If filters or sharing
+	// functions are present, we can't know the size, so we make a default map.
+	var sasMap map[string]*SummaryAllocation
+	if len(ffs) == 0 && len(sfs) == 0 {
+		// No filters, so make the map of summary allocations exactly the size
+		// of the origin allocation set.
+		sasMap = make(map[string]*SummaryAllocation, len(as.allocations))
+	} else {
+		// There are filters, so start with a standard map
+		sasMap = make(map[string]*SummaryAllocation)
+	}
+
+	sas := &SummaryAllocationSet{
+		SummaryAllocations: sasMap,
+		Window:             as.Window.Clone(),
+	}
+
+	for _, alloc := range as.allocations {
+		// First, detect if the allocation should be shared. If so, mark it as
+		// such, insert it, and continue.
+		shouldShare := false
+		for _, sf := range sfs {
+			if sf(alloc) {
+				shouldShare = true
+				break
+			}
+		}
+		if shouldShare {
+			sa := NewSummaryAllocation(alloc, reconcile, reconcileNetwork)
+			sa.Share = true
+			sas.Insert(sa)
+			continue
+		}
+
+		// If the allocation does not pass any of the given filter functions,
+		// do not insert it into the set.
+		shouldFilter := false
+		for _, ff := range ffs {
+			if !ff(alloc) {
+				shouldFilter = true
+				break
+			}
+		}
+		if shouldFilter {
+			continue
+		}
+
+		err := sas.Insert(NewSummaryAllocation(alloc, reconcile, reconcileNetwork))
+		if err != nil {
+			log.Errorf("SummaryAllocation: error inserting summary of %s", alloc.Name)
+		}
+	}
+
+	for key := range as.externalKeys {
+		sas.externalKeys[key] = true
+	}
+
+	for key := range as.idleKeys {
+		sas.idleKeys[key] = true
+	}
+
+	return sas
+}
+
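A hedged usage sketch, assuming an `*AllocationSet` obtained elsewhere (e.g. from a store query): share everything in kube-system, keep only the kubecost namespace otherwise, and reconcile fully:

```go
package main

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/kubecost"
)

// summarize is a hypothetical helper, not part of the package. Sharing is
// checked before filtering above, so kube-system allocations survive the
// namespace filter by virtue of being marked Share.
func summarize(as *kubecost.AllocationSet) *kubecost.SummaryAllocationSet {
	shareFuncs := []kubecost.AllocationMatchFunc{
		func(a *kubecost.Allocation) bool { return a.Properties.Namespace == "kube-system" },
	}
	filterFuncs := []kubecost.AllocationMatchFunc{
		func(a *kubecost.Allocation) bool { return a.Properties.Namespace == "kubecost" },
	}
	return kubecost.NewSummaryAllocationSet(as, filterFuncs, shareFuncs, true, true)
}

func main() {
	fmt.Println(summarize(nil)) // nil input yields nil, per the guard above
}
```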
+// Add sums two SummaryAllocationSets, Adding all SummaryAllocations in the
+// given SummaryAllocationSet to their counterparts in the receiving set. Add
+// also expands the Window to include both constituent Windows, in the case
+// that Add is being used for accumulating (as opposed to aggregating). For
+// performance reasons, the function may return either a new set or an
+// unmodified original, so it should not be assumed that the original sets are
+// safely usable after calling Add.
+func (sas *SummaryAllocationSet) Add(that *SummaryAllocationSet) (*SummaryAllocationSet, error) {
+	if sas == nil || len(sas.SummaryAllocations) == 0 {
+		return that, nil
+	}
+
+	if that == nil || len(that.SummaryAllocations) == 0 {
+		return sas, nil
+	}
+
+	if sas.Window.IsOpen() {
+		return nil, errors.New("cannot add a SummaryAllocationSet with an open window")
+	}
+
+	// Set start, end to min(start), max(end)
+	start := *sas.Window.Start()
+	end := *sas.Window.End()
+	if that.Window.Start().Before(start) {
+		start = *that.Window.Start()
+	}
+	if that.Window.End().After(end) {
+		end = *that.Window.End()
+	}
+
+	acc := &SummaryAllocationSet{
+		SummaryAllocations: make(map[string]*SummaryAllocation, len(sas.SummaryAllocations)),
+		Window:             NewClosedWindow(start, end),
+	}
+
+	sas.RLock()
+	defer sas.RUnlock()
+
+	that.RLock()
+	defer that.RUnlock()
+
+	for _, alloc := range sas.SummaryAllocations {
+		err := acc.Insert(alloc)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	for _, alloc := range that.SummaryAllocations {
+		err := acc.Insert(alloc)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return acc, nil
+}
+
+// AggregateBy aggregates the SummaryAllocations in the given
+// SummaryAllocationSet by the given properties. This is only legal if the set
+// is divisible by the given properties; e.g. Containers can be aggregated by
+// Namespace, but not vice versa.
+func (sas *SummaryAllocationSet) AggregateBy(aggregateBy []string, options *AllocationAggregationOptions) error {
+	if sas == nil || len(sas.SummaryAllocations) == 0 {
+		return nil
+	}
+
+	if sas.Window.IsOpen() {
+		return errors.New("cannot aggregate a SummaryAllocationSet with an open window")
+	}
+
+	if options == nil {
+		options = &AllocationAggregationOptions{}
+	}
+
+	if options.LabelConfig == nil {
+		options.LabelConfig = NewLabelConfig()
+	}
+
+	// Check if we have any work to do; if not, then early return. If
+	// aggregateBy is nil, we don't aggregate anything. On the other hand,
+	// an empty slice implies that we should aggregate everything. (See
+	// generateKey for why that makes sense.)
+	shouldAggregate := aggregateBy != nil
+	shouldShare := len(options.SharedHourlyCosts) > 0 || len(options.ShareFuncs) > 0
+	if !shouldAggregate && !shouldShare {
+		return nil
+	}
+
+	// The order of operations for aggregating a SummaryAllocationSet is as
+	// follows:
+	//
+	//  1. Partition external, idle, and shared allocations into separate sets.
+	//     Also, create the resultSet into which the results will be aggregated.
+	//
+	//  2. Record resource totals for shared costs and unmounted volumes so
+	//     that we can account for them in computing idle coefficients.
+	//
+	//  3. Retrieve pre-computed allocation resource totals, which will be used
+	//     to compute idle sharing coefficients.
+	//
+	//  4. Convert shared hourly cost into a cumulative allocation to share,
+	//     and insert it into the share set.
+	//
+	//  5. Compute sharing coefficients per-aggregation, if sharing resources.
+	//
+	//  6. Distribute idle allocations according to the idle coefficients.
+	//
+	//  7. Record allocation resource totals (after filtration) if filters have
+	//     been applied. (Used for filtering proportional amount of idle.)
+	//
+	//  8. Generate aggregation key and insert allocation into the output set
+	//
+	//  9. If idle is shared and resources are shared, it's probable that some
+	//     amount of idle cost will be shared with a shared resource.
+	//     Distribute that idle cost, if it exists, among the respective shared
+	//     allocations before sharing them with the aggregated allocations.
+	//
+	// 10. Apply idle filtration, which "filters" the idle cost, or scales it
+	//     by the proportion of allocation resources remaining after filters
+	//     have been applied.
+	//
+	// 11. Distribute shared resources according to sharing coefficients.
+	//
+	// 12. Insert external allocations into the result set.
+	//
+	// 13. Insert any undistributed idle, in the case that idle
+	//     coefficients end up being zero and some idle is not shared.
+	//
+	// 14. Combine all idle allocations into a single idle allocation, unless
+	//     the option to keep idle split by cluster or node is enabled.
+
+	// 1. Partition external, idle, and shared allocations into separate sets.
+	// Also, create the resultSet into which the results will be aggregated.
+
+	// resultSet will collect the aggregated allocations
+	resultSet := &SummaryAllocationSet{
+		Window: sas.Window.Clone(),
+	}
+
+	// externalSet will collect external allocations
+	externalSet := &SummaryAllocationSet{
+		Window: sas.Window.Clone(),
+	}
+
+	// idleSet will collect idle allocations, which will be distributed among
+	// the resultSet after initial aggregation is complete
+	idleSet := &SummaryAllocationSet{
+		Window: sas.Window.Clone(),
+	}
+
+	// shareSet will collect allocations to be shared among the resultSet
+	// after initial aggregation is complete
+	shareSet := &SummaryAllocationSet{
+		Window: sas.Window.Clone(),
+	}
+
+	sas.Lock()
+	defer sas.Unlock()
+
+	// 2. Record resource totals for shared costs, aggregating by cluster or by
+	// node (depending on if idle is partitioned by cluster or node) so that we
+	// can account for them in computing idle coefficients. Do the same for
+	// unmounted volume costs, which only require a total cost.
+	sharedResourceTotals := map[string]*AllocationTotals{}
+	totalUnmountedCost := 0.0
+
+	// 1 & 2. Identify set membership and aggregate aforementioned totals.
+	for _, sa := range sas.SummaryAllocations {
+		if sa.Share {
+			var key string
+			if options.IdleByNode {
+				key = fmt.Sprintf("%s/%s", sa.Properties.Cluster, sa.Properties.Node)
+			} else {
+				key = sa.Properties.Cluster
+			}
+
+			if _, ok := sharedResourceTotals[key]; !ok {
+				sharedResourceTotals[key] = &AllocationTotals{}
+			}
+			sharedResourceTotals[key].CPUCost += sa.CPUCost
+			sharedResourceTotals[key].GPUCost += sa.GPUCost
+			sharedResourceTotals[key].LoadBalancerCost += sa.LoadBalancerCost
+			sharedResourceTotals[key].NetworkCost += sa.NetworkCost
+			sharedResourceTotals[key].PersistentVolumeCost += sa.PVCost
+			sharedResourceTotals[key].RAMCost += sa.RAMCost
+
+			shareSet.Insert(sa)
+			delete(sas.SummaryAllocations, sa.Name)
+
+			continue
+		}
+
+		// External allocations get aggregated post-hoc (see step 12) and do
+		// not necessarily contain complete sets of properties, so they are
+		// moved to a separate AllocationSet.
+		if sa.IsExternal() {
+			delete(sas.externalKeys, sa.Name)
+			delete(sas.SummaryAllocations, sa.Name)
+			externalSet.Insert(sa)
+			continue
+		}
+
+		// Idle allocations should be separated into idleSet if they are to be
+		// shared later on. If they are not to be shared, then add them to the
+		// resultSet like any other allocation.
+		if sa.IsIdle() {
+			delete(sas.idleKeys, sa.Name)
+			delete(sas.SummaryAllocations, sa.Name)
+
+			if options.ShareIdle == ShareEven || options.ShareIdle == ShareWeighted {
+				idleSet.Insert(sa)
+			} else {
+				resultSet.Insert(sa)
+			}
+
+			continue
+		}
+
+		// Track total unmounted cost because it must be taken out of total
+		// allocated costs for sharing coefficients.
+		if sa.IsUnmounted() {
+			totalUnmountedCost += sa.TotalCost()
+		}
+	}
+
+	// It's possible that no more un-shared, non-idle, non-external allocations
+	// remain at this point. This always results in an empty set, so return early.
+	if len(sas.SummaryAllocations) == 0 {
+		sas.SummaryAllocations = map[string]*SummaryAllocation{}
+		return nil
+	}
+
+	// 3. Retrieve pre-computed allocation resource totals, which will be used
+	// to compute idle coefficients, based on the ratio of an allocation's per-
+	// resource cost to the per-resource totals of that allocation's cluster or
+	// node. Whether to perform this operation based on cluster or node is an
+	// option. (See IdleByNode documentation; defaults to idle-by-cluster.)
+	var allocTotals map[string]*AllocationTotals
+	var ok bool
+	if options.IdleByNode {
+		if options.AllocationTotalsStore != nil {
+			allocTotals, ok = options.AllocationTotalsStore.GetAllocationTotalsByNode(*sas.Window.Start(), *sas.Window.End())
+			if !ok {
+				return fmt.Errorf("nil allocation resource totals by node for %s", sas.Window)
+			}
+		}
+	} else {
+		if options.AllocationTotalsStore != nil {
+			allocTotals, ok = options.AllocationTotalsStore.GetAllocationTotalsByCluster(*sas.Window.Start(), *sas.Window.End())
+			if !ok {
+				return fmt.Errorf("nil allocation resource totals by cluster for %s", sas.Window)
+			}
+		}
+	}
+
+	// If filters have been applied, then we need to record allocation resource
+	// totals after filtration (i.e. the allocations that are present) so that
+	// we can identify the proportion of idle cost to keep. That is, we should
+	// only return the idle cost that would be shared with the remaining
+	// allocations, even if we're keeping idle separate. The totals should be
+	// recorded by idle-key (cluster or node, depending on the IdleByNode
+	// option). Instantiating this map is a signal to record the totals.
+	var allocTotalsAfterFilters map[string]*AllocationTotals
+	if len(resultSet.idleKeys) > 0 && len(options.FilterFuncs) > 0 {
+		allocTotalsAfterFilters = make(map[string]*AllocationTotals, len(resultSet.idleKeys))
+	}
+
+	// If we're recording allocTotalsAfterFilters and there are shared costs,
+	// then record those resource totals here so that idle for those shared
+	// resources gets included.
+	if allocTotalsAfterFilters != nil {
+		for key, rt := range sharedResourceTotals {
+			if _, ok := allocTotalsAfterFilters[key]; !ok {
+				allocTotalsAfterFilters[key] = &AllocationTotals{}
+			}
+
+			// Record only those fields required for computing idle
+			allocTotalsAfterFilters[key].CPUCost += rt.CPUCost
+			allocTotalsAfterFilters[key].GPUCost += rt.GPUCost
+			allocTotalsAfterFilters[key].RAMCost += rt.RAMCost
+		}
+	}
+
+	// 4. Convert shared hourly cost into a cumulative allocation to share,
+	// and insert it into the share set.
+	for name, cost := range options.SharedHourlyCosts {
+		if cost > 0.0 {
+			hours := sas.Window.Hours()
+
+			// If set ends in the future, adjust hours accordingly
+			diff := time.Since(*sas.Window.End())
+			if diff < 0.0 {
+				hours += diff.Hours()
+			}
+
+			totalSharedCost := cost * hours
+
+			shareSet.Insert(&SummaryAllocation{
+				Name:       fmt.Sprintf("%s/%s", name, SharedSuffix),
+				Properties: &AllocationProperties{},
+				Start:      *sas.Window.Start(),
+				End:        *sas.Window.End(),
+				SharedCost: totalSharedCost,
+			})
+		}
+	}
+
+	// Sharing coefficients are recorded by post-aggregation-key (e.g. if
+	// aggregating by namespace, then the key will be the namespace) and only
+	// need to be recorded if there are shared resources. Instantiating this
+	// map is the signal to record sharing coefficients.
+	var sharingCoeffs map[string]float64
+	if len(shareSet.SummaryAllocations) > 0 {
+		sharingCoeffs = map[string]float64{}
+	}
+
+	// Loop over all remaining SummaryAllocations (after filters, sharing, &c.)
+	// doing the following, in this order:
+	//  5. Compute sharing coefficients, if there are shared resources
+	//  6. Distribute idle cost, if sharing idle
+	//  7. Record allocTotalsAfterFiltration, if filters have been applied
+	//  8. Aggregate by key
+	for _, sa := range sas.SummaryAllocations {
+		// Generate key to use for aggregation-by-key and allocation name
+		key := sa.generateKey(aggregateBy, options.LabelConfig)
+
+		// 5. Incrementally add to sharing coefficients before adding idle
+		// cost, which would skew the coefficients. These coefficients will be
+		// later divided by a total, turning them into a coefficient between
+		// 0.0 and 1.0.
+		// NOTE: SummaryAllocation does not support ShareEven, so only record
+		// by cost for cost-weighted distribution.
+		if sharingCoeffs != nil {
+			sharingCoeffs[key] += sa.TotalCost()
+		}
+
+		// 6. Distribute idle allocations according to the idle coefficients.
+		// NOTE: if idle sharing is off (i.e. ShareIdle == ShareNone) then
+		// all idle allocations will be in the resultSet at this point, so idleSet
+		// will be empty and we won't enter this block.
+		if len(idleSet.SummaryAllocations) > 0 {
+			for _, idle := range idleSet.SummaryAllocations {
+				// Idle key is either cluster or node, as determined by the
+				// IdleByNode option.
+				var key string
+
+				// Only share idle allocation with current allocation (sa) if
+				// the relevant properties match (i.e. cluster and/or node)
+				if idle.Properties.Cluster != sa.Properties.Cluster {
+					continue
+				}
+				key = idle.Properties.Cluster
+
+				if options.IdleByNode {
+					if idle.Properties.Node != sa.Properties.Node {
+						continue
+					}
+					key = fmt.Sprintf("%s/%s", idle.Properties.Cluster, idle.Properties.Node)
+				}
+
+				cpuCoeff, gpuCoeff, ramCoeff := ComputeIdleCoefficients(options.ShareIdle, key, sa.CPUCost, sa.GPUCost, sa.RAMCost, allocTotals)
+
+				sa.CPUCost += idle.CPUCost * cpuCoeff
+				sa.GPUCost += idle.GPUCost * gpuCoeff
+				sa.RAMCost += idle.RAMCost * ramCoeff
+			}
+		}
+
+		// The key becomes the allocation's name, which is used as the key by
+		// which the allocation is inserted into the set.
+		sa.Name = key
+
+		// If merging unallocated allocations, rename all unallocated
+		// allocations as simply __unallocated__
+		if options.MergeUnallocated && sa.IsUnallocated() {
+			sa.Name = UnallocatedSuffix
+		}
+
+		// 7. Record filtered resource totals for idle allocation filtration,
+		// only if necessary.
+		if allocTotalsAfterFilters != nil {
+			key := sa.Properties.Cluster
+			if options.IdleByNode {
+				key = fmt.Sprintf("%s/%s", sa.Properties.Cluster, sa.Properties.Node)
+			}
+
+			if _, ok := allocTotalsAfterFilters[key]; ok {
+				allocTotalsAfterFilters[key].CPUCost += sa.CPUCost
+				allocTotalsAfterFilters[key].GPUCost += sa.GPUCost
+				allocTotalsAfterFilters[key].RAMCost += sa.RAMCost
+			} else {
+				allocTotalsAfterFilters[key] = &AllocationTotals{
+					CPUCost: sa.CPUCost,
+					GPUCost: sa.GPUCost,
+					RAMCost: sa.RAMCost,
+				}
+			}
+		}
+
+		// 8. Inserting the allocation with the generated key for a name
+		// performs the actual aggregation step.
+		resultSet.Insert(sa)
+	}
+
+	// 9. If idle is shared and resources are shared, it's probable that some
+	// amount of idle cost will be shared with a shared resource. Distribute
+	// that idle cost, if it exists, among the respective shared allocations
+	// before sharing them with the aggregated allocations.
+	if len(idleSet.SummaryAllocations) > 0 && len(shareSet.SummaryAllocations) > 0 {
+		for _, sa := range shareSet.SummaryAllocations {
+			for _, idle := range idleSet.SummaryAllocations {
+				var key string
+
+				// Only share idle allocation with current allocation (sa) if
+				// the relevant property matches (i.e. Cluster or Node,
+				// depending on which idle sharing option is selected)
+				if options.IdleByNode {
+					if idle.Properties.Node != sa.Properties.Node {
+						continue
+					}
+
+					key = idle.Properties.Node
+				} else {
+					if idle.Properties.Cluster != sa.Properties.Cluster {
+						continue
+					}
+
+					key = idle.Properties.Cluster
+				}
+
+				cpuCoeff, gpuCoeff, ramCoeff := ComputeIdleCoefficients(options.ShareIdle, key, sa.CPUCost, sa.GPUCost, sa.RAMCost, allocTotals)
+
+				sa.CPUCost += idle.CPUCost * cpuCoeff
+				sa.GPUCost += idle.GPUCost * gpuCoeff
+				sa.RAMCost += idle.RAMCost * ramCoeff
+			}
+		}
+	}
+
+	// 10. Apply idle filtration, which "filters" the idle cost, i.e. scales
+	// idle allocation costs per-resource by the proportion of allocation
+	// resources remaining after filtering. In effect, this returns only the
+	// idle costs that would have been shared with the remaining allocations,
+	// even if idle is kept separated.
+	if allocTotalsAfterFilters != nil {
+		for idleKey := range resultSet.idleKeys {
+			ia := resultSet.SummaryAllocations[idleKey]
+
+			var key string
+			if options.IdleByNode {
+				key = ia.Properties.Node
+			} else {
+				key = ia.Properties.Cluster
+			}
+
+			// Percentage of idle that should remain after filters are applied,
+			// which equals the proportion of filtered-to-actual cost.
+			cpuFilterCoeff := 0.0
+			if allocTotals[key].CPUCost > 0.0 {
+				cpuFilterCoeff = allocTotalsAfterFilters[key].CPUCost / allocTotals[key].CPUCost
+			}
+
+			gpuFilterCoeff := 0.0
+			if allocTotals[key].GPUCost > 0.0 {
+				gpuFilterCoeff = allocTotalsAfterFilters[key].GPUCost / allocTotals[key].GPUCost
+			}
+
+			ramFilterCoeff := 0.0
+			if allocTotals[key].RAMCost > 0.0 {
+				ramFilterCoeff = allocTotalsAfterFilters[key].RAMCost / allocTotals[key].RAMCost
+			}
+
+			ia.CPUCost *= cpuFilterCoeff
+			ia.GPUCost *= gpuFilterCoeff
+			ia.RAMCost *= ramFilterCoeff
+		}
+	}
+
+	// 11. Distribute shared resources according to sharing coefficients.
+	// NOTE: ShareEven is not supported
+	if len(shareSet.SummaryAllocations) > 0 {
+		sharingCoeffDenominator := 0.0
+		for _, rt := range allocTotals {
+			sharingCoeffDenominator += rt.TotalCost()
+		}
+
+		// Do not include the shared costs, themselves, when determining
+		// sharing coefficients.
+		for _, rt := range sharedResourceTotals {
+			sharingCoeffDenominator -= rt.TotalCost()
+		}
+
+		// Do not include the unmounted costs when determining sharing
+		// coefficients because they do not receive shared costs.
+		sharingCoeffDenominator -= totalUnmountedCost
+
+		if sharingCoeffDenominator <= 0.0 {
+			log.Warningf("SummaryAllocation: sharing coefficient denominator is %f", sharingCoeffDenominator)
+		} else {
+			// Compute sharing coeffs by dividing the thus-far accumulated
+			// numerators by the now-finalized denominator.
+			for key := range sharingCoeffs {
+				sharingCoeffs[key] /= sharingCoeffDenominator
+			}
+
+			for key, sa := range resultSet.SummaryAllocations {
+				// Idle and unmounted allocations, by definition, do not
+				// receive shared cost
+				if sa.IsIdle() || sa.IsUnmounted() {
+					continue
+				}
+
+				sharingCoeff := sharingCoeffs[key]
+
+				// Distribute each shared cost with the current allocation on the
+				// basis of the proportion of the allocation's cost (ShareWeighted)
+				// to the total aggregated cost; ShareEven is not supported here.
+				// This holds in spite of filters because the sharing coefficient
+				// denominator is held constant by pre-computed resource totals,
+				// and the post-aggregation total cost of the remaining
+				// allocations is, by definition, not affected.
+				for _, shared := range shareSet.SummaryAllocations {
+					sa.SharedCost += shared.TotalCost() * sharingCoeff
+				}
+			}
+		}
+	}
+
+	// 12. Insert external allocations into the result set.
+	for _, sa := range externalSet.SummaryAllocations {
+		skip := false
+
+		for _, ff := range options.FilterFuncs {
+			// Make an allocation with the same properties and test that
+			// against the FilterFunc to see if the external allocation should
+			// be filtered or not.
+			// TODO:CLEANUP do something about external cost, this stinks
+			ea := &Allocation{Properties: sa.Properties}
+			if !ff(ea) {
+				skip = true
+				break
+			}
+		}
+
+		if !skip {
+			key := sa.generateKey(aggregateBy, options.LabelConfig)
+
+			sa.Name = key
+			resultSet.Insert(sa)
+		}
+	}
+
+	// 13. Distribute remaining, undistributed idle. Undistributed idle is any
+	// per-resource idle cost for which there can be no idle coefficient
+	// computed because there is zero usage across all allocations.
+	for _, ia := range idleSet.SummaryAllocations {
+		key := ia.Properties.Cluster
+		if options.IdleByNode {
+			key = fmt.Sprintf("%s/%s", ia.Properties.Cluster, ia.Properties.Node)
+		}
+
+		rt, ok := allocTotals[key]
+		if !ok {
+			log.Warningf("SummaryAllocation: AggregateBy: cannot handle undistributed idle for '%s'", key)
+			continue
+		}
+
+		hasUndistributableCost := false
+
+		if ia.CPUCost > 0.0 && rt.CPUCost == 0.0 {
+			// There is idle CPU cost, but no allocated CPU cost, so that cost
+			// is undistributable and must be inserted.
+			hasUndistributableCost = true
+		} else {
+			// Cost was entirely distributed, so zero it out
+			ia.CPUCost = 0.0
+		}
+
+		if ia.GPUCost > 0.0 && rt.GPUCost == 0.0 {
+			// There is idle GPU cost, but no allocated GPU cost, so that cost
+			// is undistributable and must be inserted.
+			hasUndistributableCost = true
+		} else {
+			// Cost was entirely distributed, so zero it out
+			ia.GPUCost = 0.0
+		}
+
+		if ia.RAMCost > 0.0 && rt.RAMCost == 0.0 {
+			// There is idle RAM cost, but no allocated RAM cost, so that cost
+			// is undistributable and must be inserted.
+			hasUndistributableCost = true
+		} else {
+			// Cost was entirely distributed, so zero it out
+			ia.RAMCost = 0.0
+		}
+
+		if hasUndistributableCost {
+			ia.Name = fmt.Sprintf("%s/%s", key, IdleSuffix)
+			resultSet.Insert(ia)
+		}
+	}
+
+	// 14. Combine all idle allocations into a single idle allocation, unless
+	// the option to keep idle split by cluster or node is enabled.
+	if !options.SplitIdle {
+		for _, ia := range resultSet.idleAllocations() {
+			resultSet.Delete(ia.Name)
+			ia.Name = IdleSuffix
+			resultSet.Insert(ia)
+		}
+	}
+
+	// Replace the existing set's data with the new, aggregated summary data
+	sas.SummaryAllocations = resultSet.SummaryAllocations
+
+	return nil
+}
+
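Tying the fourteen steps together, a hedged usage sketch: aggregate a summary set by namespace, share a flat hourly overhead cost, and distribute idle cost-weighted. Note that idle sharing also requires a populated AllocationTotalsStore (see step 3); the "overhead" name and $0.25/hr rate here are illustrative:

```go
package main

import (
	"log"

	"github.com/kubecost/cost-model/pkg/kubecost"
)

// aggregateByNamespace is a hypothetical helper over a SummaryAllocationSet
// obtained elsewhere.
func aggregateByNamespace(sas *kubecost.SummaryAllocationSet) {
	err := sas.AggregateBy([]string{kubecost.AllocationNamespaceProp}, &kubecost.AllocationAggregationOptions{
		SharedHourlyCosts: map[string]float64{"overhead": 0.25},
		ShareIdle:         kubecost.ShareWeighted,
		// AllocationTotalsStore should also be supplied for idle sharing
		// (see step 3); it is omitted in this sketch.
	})
	if err != nil {
		log.Printf("aggregation failed: %s", err)
	}
}

func main() {
	aggregateByNamespace(nil) // a nil set is a safe no-op, per the guard above
}
```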
+// Delete removes the allocation with the given name from the set
+func (sas *SummaryAllocationSet) Delete(name string) {
+	if sas == nil {
+		return
+	}
+
+	sas.Lock()
+	defer sas.Unlock()
+
+	delete(sas.externalKeys, name)
+	delete(sas.idleKeys, name)
+	delete(sas.SummaryAllocations, name)
+}
+
+// Each invokes the given function for each SummaryAllocation in the set
+func (sas *SummaryAllocationSet) Each(f func(string, *SummaryAllocation)) {
+	if sas == nil {
+		return
+	}
+
+	for k, a := range sas.SummaryAllocations {
+		f(k, a)
+	}
+}
+
+// idleAllocations returns a map of the idle allocations in the SummaryAllocationSet.
+func (sas *SummaryAllocationSet) idleAllocations() map[string]*SummaryAllocation {
+	idles := map[string]*SummaryAllocation{}
+
+	if sas == nil || len(sas.SummaryAllocations) == 0 {
+		return idles
+	}
+
+	sas.RLock()
+	defer sas.RUnlock()
+
+	for key := range sas.idleKeys {
+		if sa, ok := sas.SummaryAllocations[key]; ok {
+			idles[key] = sa
+		}
+	}
+
+	return idles
+}
+
+// Insert adds the given SummaryAllocation to the set, aggregating it into the
+// existing entry of the same name if one exists; otherwise, a new entry is
+// created. A nil error response indicates success.
+func (sas *SummaryAllocationSet) Insert(sa *SummaryAllocation) error {
+	if sas == nil {
+		return fmt.Errorf("cannot insert into nil SummaryAllocationSet")
+	}
+
+	if sa == nil {
+		return fmt.Errorf("cannot insert a nil SummaryAllocation")
+	}
+
+	sas.Lock()
+	defer sas.Unlock()
+
+	if sas.SummaryAllocations == nil {
+		sas.SummaryAllocations = map[string]*SummaryAllocation{}
+	}
+
+	if sas.externalKeys == nil {
+		sas.externalKeys = map[string]bool{}
+	}
+
+	if sas.idleKeys == nil {
+		sas.idleKeys = map[string]bool{}
+	}
+
+	// Add the given Allocation to the existing entry, if there is one;
+	// otherwise just set directly into allocations
+	if _, ok := sas.SummaryAllocations[sa.Name]; ok {
+		err := sas.SummaryAllocations[sa.Name].Add(sa)
+		if err != nil {
+			return fmt.Errorf("SummaryAllocationSet.Insert: error trying to Add: %s", err)
+		}
+	} else {
+		sas.SummaryAllocations[sa.Name] = sa
+	}
+
+	// If the given Allocation is an external one, record that
+	if sa.IsExternal() {
+		sas.externalKeys[sa.Name] = true
+	}
+
+	// If the given Allocation is an idle one, record that
+	if sa.IsIdle() {
+		sas.idleKeys[sa.Name] = true
+	}
+
+	return nil
+}
+
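Insert doubles as the aggregation primitive: inserting two allocations under the same name sums them rather than replacing. A minimal sketch:

```go
package main

import (
	"fmt"
	"time"

	"github.com/kubecost/cost-model/pkg/kubecost"
)

func main() {
	start := time.Date(2021, 7, 1, 0, 0, 0, 0, time.UTC)
	end := start.Add(24 * time.Hour)

	sas := &kubecost.SummaryAllocationSet{Window: kubecost.NewClosedWindow(start, end)}

	// Two inserts under the same name are Add-ed together, not replaced.
	_ = sas.Insert(&kubecost.SummaryAllocation{Name: "kubecost", Start: start, End: end, CPUCost: 1.0})
	_ = sas.Insert(&kubecost.SummaryAllocation{Name: "kubecost", Start: start, End: end, CPUCost: 2.0})

	fmt.Println(sas.SummaryAllocations["kubecost"].CPUCost) // 3
}
```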
+// SummaryAllocationSetRange is a thread-safe slice of SummaryAllocationSets.
+type SummaryAllocationSetRange struct {
+	sync.RWMutex
+	Step                  time.Duration           `json:"step"`
+	SummaryAllocationSets []*SummaryAllocationSet `json:"sets"`
+	Window                Window                  `json:"window"`
+}
+
+// NewSummaryAllocationSetRange instantiates a new range composed of the given
+// SummaryAllocationSets in the order provided. The expectations about the
+// SummaryAllocationSets are as follows:
+// - window durations are all equal
+// - sets are consecutive (i.e. chronologically sorted)
+// - there are no gaps between sets
+// - sets do not have overlapping windows
+func NewSummaryAllocationSetRange(sass ...*SummaryAllocationSet) *SummaryAllocationSetRange {
+	var step time.Duration
+	window := NewWindow(nil, nil)
+
+	for _, sas := range sass {
+		if window.Start() == nil || (sas.Window.Start() != nil && sas.Window.Start().Before(*window.Start())) {
+			window.start = sas.Window.Start()
+		}
+		if window.End() == nil || (sas.Window.End() != nil && sas.Window.End().After(*window.End())) {
+			window.end = sas.Window.End()
+		}
+		if step == 0 {
+			step = sas.Window.Duration()
+		} else if step != sas.Window.Duration() {
+			log.Warningf("instantiating range with step %s using set of step %s is illegal", step, sas.Window.Duration())
+		}
+	}
+
+	return &SummaryAllocationSetRange{
+		Step:                  step,
+		SummaryAllocationSets: sass,
+		Window:                window,
+	}
+}
+
+// Accumulate sums each SummaryAllocationSet in the given range, returning a
+// single cumulative SummaryAllocationSet for the entire range.
+func (sasr *SummaryAllocationSetRange) Accumulate() (*SummaryAllocationSet, error) {
+	var result *SummaryAllocationSet
+	var err error
+
+	sasr.RLock()
+	defer sasr.RUnlock()
+
+	for _, sas := range sasr.SummaryAllocationSets {
+		result, err = result.Add(sas)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return result, nil
+}
+
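A sketch of accumulation over a range, assuming daily sets obtained elsewhere that satisfy the expectations listed for NewSummaryAllocationSetRange:

```go
package main

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/kubecost"
)

// accumulateDays is a hypothetical helper: the inputs are assumed to be
// consecutive, equal-duration, non-overlapping daily sets.
func accumulateDays(days ...*kubecost.SummaryAllocationSet) (*kubecost.SummaryAllocationSet, error) {
	sasr := kubecost.NewSummaryAllocationSetRange(days...)
	return sasr.Accumulate() // one cumulative set spanning the whole window
}

func main() {
	total, err := accumulateDays() // an empty range accumulates to a nil set
	fmt.Println(total, err)
}
```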
+// AggregateBy aggregates each SummaryAllocationSet in the range by the given
+// properties and options.
+func (sasr *SummaryAllocationSetRange) AggregateBy(aggregateBy []string, options *AllocationAggregationOptions) error {
+	sasr.Lock()
+	defer sasr.Unlock()
+
+	for _, sas := range sasr.SummaryAllocationSets {
+		err := sas.AggregateBy(aggregateBy, options)
+		if err != nil {
+			// Wipe out data so that corrupt data cannot be mistakenly used
+			sasr.SummaryAllocationSets = []*SummaryAllocationSet{}
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Append appends the given SummaryAllocationSet to the end of the range. It does not
+// validate whether or not that violates window continuity.
+func (sasr *SummaryAllocationSetRange) Append(sas *SummaryAllocationSet) error {
+	if sasr.Step != 0 && sas.Window.Duration() != sasr.Step {
+		return fmt.Errorf("cannot append set with duration %s to range of step %s", sas.Window.Duration(), sasr.Step)
+	}
+
+	sasr.Lock()
+	defer sasr.Unlock()
+
+	// Append to list of sets
+	sasr.SummaryAllocationSets = append(sasr.SummaryAllocationSets, sas)
+
+	// Set step, if not set
+	if sasr.Step == 0 {
+		sasr.Step = sas.Window.Duration()
+	}
+
+	// Adjust window
+	if sasr.Window.Start() == nil || (sas.Window.Start() != nil && sas.Window.Start().Before(*sasr.Window.Start())) {
+		sasr.Window.start = sas.Window.Start()
+	}
+	if sasr.Window.End() == nil || (sas.Window.End() != nil && sas.Window.End().After(*sasr.Window.End())) {
+		sasr.Window.end = sas.Window.End()
+	}
+
+	return nil
+}
+
+// Each invokes the given function for each SummaryAllocationSet in the range
+func (sasr *SummaryAllocationSetRange) Each(f func(int, *SummaryAllocationSet)) {
+	if sasr == nil {
+		return
+	}
+
+	for i, as := range sasr.SummaryAllocationSets {
+		f(i, as)
+	}
+}
+
+// InsertExternalAllocations takes all allocations in the given
+// AllocationSetRange (they should all be considered "external") and inserts
+// them into the receiving SummaryAllocationSetRange.
+// TODO:CLEANUP replace this with a better idea (or get rid of external
+// allocations, as such, altogether)
+func (sasr *SummaryAllocationSetRange) InsertExternalAllocations(that *AllocationSetRange) error {
+	if sasr == nil {
+		return fmt.Errorf("cannot insert range into nil AllocationSetRange")
+	}
+
+	// keys maps window to index in range
+	keys := map[string]int{}
+	for i, as := range sasr.SummaryAllocationSets {
+		if as == nil {
+			continue
+		}
+		keys[as.Window.String()] = i
+	}
+
+	// Nothing to merge, so simply return
+	if len(keys) == 0 {
+		return nil
+	}
+
+	var err error
+	that.Each(func(j int, thatAS *AllocationSet) {
+		if thatAS == nil || err != nil {
+			return
+		}
+
+		// Find matching SummaryAllocationSet in sasr
+		i, ok := keys[thatAS.Window.String()]
+		if !ok {
+			err = fmt.Errorf("cannot merge AllocationSet into window that does not exist: %s", thatAS.Window.String())
+			return
+		}
+		sas := sasr.SummaryAllocationSets[i]
+
+		// Insert each Allocation from the given set
+		thatAS.Each(func(k string, alloc *Allocation) {
+			externalSA := NewSummaryAllocation(alloc, true, true)
+			// This error will be returned below
+			// TODO:CLEANUP should Each have early-error-return functionality?
+			err = sas.Insert(externalSA)
+		})
+	})
+
+	// err might be nil
+	return err
+}
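Taken together, these methods form a small pipeline: Append builds the range, AggregateBy rolls each set up, and Accumulate collapses the range into one cumulative set. A sketch of that flow, assuming the zero-value range is usable, a slice of per-step sets, and that AllocationNamespaceProp names the namespace property:

	sasr := &SummaryAllocationSetRange{}
	for _, sas := range sets {
		if err := sasr.Append(sas); err != nil {
			// sets whose window duration mismatches the step are rejected
		}
	}
	if err := sasr.AggregateBy([]string{AllocationNamespaceProp}, nil); err != nil {
		// on error the range's sets are wiped to avoid corrupt data
	}
	total, err := sasr.Accumulate()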

+ 212 - 0
pkg/kubecost/summaryallocation_test.go

@@ -0,0 +1,212 @@
+package kubecost
+
+import (
+	"testing"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/util"
+)
+
+func TestSummaryAllocation_Add(t *testing.T) {
+	window, _ := ParseWindowUTC("yesterday")
+
+	var sa1, sa2, osa1, osa2, nilsa *SummaryAllocation
+	var err error
+
+	sa1Start := *window.Start()
+
+	sa1End := *window.End()
+
+	sa1 = &SummaryAllocation{
+		Name: "cluster1/namespace1/pod1/container1",
+		Properties: &AllocationProperties{
+			Cluster:   "cluster1",
+			Namespace: "namespace1",
+			Pod:       "pod1",
+			Container: "container1",
+		},
+		Start:                  sa1Start,
+		End:                    sa1End,
+		CPUCoreRequestAverage:  0.5,
+		CPUCoreUsageAverage:    0.1,
+		CPUCost:                0.2,
+		GPUCost:                1.0,
+		NetworkCost:            0.1,
+		LoadBalancerCost:       0.6,
+		PVCost:                 0.005,
+		RAMBytesRequestAverage: 50.0 * 1024.0 * 1024.0,
+		RAMBytesUsageAverage:   10.0 * 1024.0 * 1024.0,
+		RAMCost:                0.05,
+		SharedCost:             1.0,
+		ExternalCost:           1.0,
+	}
+	osa1 = sa1.Clone()
+
+	// sa2 is just as expensive, with twice as much usage and request, and half
+	// the time compared to sa1
+
+	sa2Start := *window.Start()
+	sa2Start = sa2Start.Add(6 * time.Hour)
+
+	sa2End := *window.End()
+	sa2End = sa2End.Add(-6 * time.Hour)
+
+	sa2 = &SummaryAllocation{
+		Name: "cluster1/namespace1/pod2/container2",
+		Properties: &AllocationProperties{
+			Cluster:   "cluster1",
+			Namespace: "namespace1",
+			Pod:       "pod2",
+			Container: "container2",
+		},
+		Start:                  sa2Start,
+		End:                    sa2End,
+		CPUCoreRequestAverage:  sa1.CPUCoreRequestAverage * 2.0,
+		CPUCoreUsageAverage:    sa1.CPUCoreUsageAverage * 2.0,
+		CPUCost:                sa1.CPUCost,
+		GPUCost:                sa1.GPUCost,
+		NetworkCost:            sa1.NetworkCost,
+		LoadBalancerCost:       sa1.LoadBalancerCost,
+		PVCost:                 sa1.PVCost,
+		RAMBytesRequestAverage: sa1.RAMBytesRequestAverage * 2.0,
+		RAMBytesUsageAverage:   sa1.RAMBytesUsageAverage * 2.0,
+		RAMCost:                sa1.RAMCost,
+		SharedCost:             sa1.SharedCost,
+		ExternalCost:           sa1.ExternalCost,
+	}
+	osa2 = sa2.Clone()
+
+	// add nil to nil, expect an error
+	t.Run("nil.Add(nil)", func(t *testing.T) {
+		err = nilsa.Add(nilsa)
+		if err == nil {
+			t.Fatalf("expected error: cannot add nil SummaryAllocations")
+		}
+	})
+
+	// reset
+	sa1 = osa1.Clone()
+	sa2 = osa2.Clone()
+
+	// add sa1 to nil, expect an error
+	t.Run("nil.Add(sa1)", func(t *testing.T) {
+		err = nilsa.Add(sa1)
+		if err == nil {
+			t.Fatalf("expected error: cannot add nil SummaryAllocations")
+		}
+	})
+
+	// reset
+	sa1 = osa1.Clone()
+	sa2 = osa2.Clone()
+
+	// add nil to sa1, expect an error
+	t.Run("sa1.Add(nil)", func(t *testing.T) {
+		err = sa1.Add(nilsa)
+		if err == nil {
+			t.Fatalf("expected error: cannot add nil SummaryAllocations")
+		}
+	})
+
+	// reset
+	sa1 = osa1.Clone()
+	sa2 = osa2.Clone()
+
+	// add sa1 to sa2 and expect same averages, but double costs
+	t.Run("sa2.Add(sa1)", func(t *testing.T) {
+		err = sa2.Add(sa1)
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		if sa2.Properties != nil {
+			t.Fatalf("expected properties to be nil; actual: %s", sa2.Properties)
+		}
+		if !util.IsApproximately(sa2.CPUCoreRequestAverage, (0.5*osa2.CPUCoreRequestAverage)+osa1.CPUCoreRequestAverage) {
+			t.Fatalf("incorrect CPUCoreRequestAverage: expected %.5f; actual %.5f", (0.5*osa2.CPUCoreRequestAverage)+osa1.CPUCoreRequestAverage, sa2.CPUCoreRequestAverage)
+		}
+		if !util.IsApproximately(sa2.CPUCoreUsageAverage, (0.5*osa2.CPUCoreUsageAverage)+osa1.CPUCoreUsageAverage) {
+			t.Fatalf("incorrect CPUCoreUsageAverage: expected %.5f; actual %.5f", (0.5*osa2.CPUCoreUsageAverage)+osa1.CPUCoreUsageAverage, sa2.CPUCoreUsageAverage)
+		}
+		if !util.IsApproximately(sa2.RAMBytesRequestAverage, (0.5*osa2.RAMBytesRequestAverage)+osa1.RAMBytesRequestAverage) {
+			t.Fatalf("incorrect RAMBytesRequestAverage: expected %.5f; actual %.5f", (0.5*osa2.RAMBytesRequestAverage)+osa1.RAMBytesRequestAverage, sa2.RAMBytesRequestAverage)
+		}
+		if !util.IsApproximately(sa2.RAMBytesUsageAverage, (0.5*osa2.RAMBytesUsageAverage)+osa1.RAMBytesUsageAverage) {
+			t.Fatalf("incorrect RAMBytesUsageAverage: expected %.5f; actual %.5f", (0.5*osa2.RAMBytesUsageAverage)+osa1.RAMBytesUsageAverage, sa2.RAMBytesUsageAverage)
+		}
+		if !util.IsApproximately(sa2.CPUCost, osa2.CPUCost+osa1.CPUCost) {
+			t.Fatalf("incorrect CPUCost: expected %.5f; actual %.5f", osa2.CPUCost+osa1.CPUCost, sa2.CPUCost)
+		}
+		if !util.IsApproximately(sa2.GPUCost, osa2.GPUCost+osa1.GPUCost) {
+			t.Fatalf("incorrect GPUCost: expected %.5f; actual %.5f", osa2.GPUCost+osa1.GPUCost, sa2.GPUCost)
+		}
+		if !util.IsApproximately(sa2.NetworkCost, osa2.NetworkCost+osa1.NetworkCost) {
+			t.Fatalf("incorrect NetworkCost: expected %.5f; actual %.5f", osa2.NetworkCost+osa1.NetworkCost, sa2.NetworkCost)
+		}
+		if !util.IsApproximately(sa2.LoadBalancerCost, osa2.LoadBalancerCost+osa1.LoadBalancerCost) {
+			t.Fatalf("incorrect LoadBalancerCost: expected %.5f; actual %.5f", osa2.LoadBalancerCost+osa1.LoadBalancerCost, sa2.LoadBalancerCost)
+		}
+		if !util.IsApproximately(sa2.PVCost, osa2.PVCost+osa1.PVCost) {
+			t.Fatalf("incorrect PVCost: expected %.5f; actual %.5f", osa2.PVCost+osa1.PVCost, sa2.PVCost)
+		}
+		if !util.IsApproximately(sa2.RAMCost, osa2.RAMCost+osa1.RAMCost) {
+			t.Fatalf("incorrect RAMCost: expected %.5f; actual %.5f", osa2.RAMCost+osa1.RAMCost, sa2.RAMCost)
+		}
+		if !util.IsApproximately(sa2.SharedCost, osa2.SharedCost+osa1.SharedCost) {
+			t.Fatalf("incorrect SharedCost: expected %.5f; actual %.5f", osa2.SharedCost+osa1.SharedCost, sa2.SharedCost)
+		}
+		if !util.IsApproximately(sa2.ExternalCost, osa2.ExternalCost+osa1.ExternalCost) {
+			t.Fatalf("incorrect ExternalCost: expected %.5f; actual %.5f", osa2.ExternalCost+osa1.ExternalCost, sa2.ExternalCost)
+		}
+	})
+
+	// reset
+	sa1 = osa1.Clone()
+	sa2 = osa2.Clone()
+
+	// add sa2 to sa1 and expect same averages, but double costs
+	t.Run("sa1.Add(sa2)", func(t *testing.T) {
+		err = sa1.Add(sa2)
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		if sa1.Properties != nil {
+			t.Fatalf("expected properties to be nil; actual: %s", sa1.Properties)
+		}
+		if !util.IsApproximately(sa1.CPUCoreRequestAverage, (0.5*osa2.CPUCoreRequestAverage)+osa1.CPUCoreRequestAverage) {
+			t.Fatalf("incorrect CPUCoreRequestAverage: expected %.5f; actual %.5f", (0.5*osa2.CPUCoreRequestAverage)+osa1.CPUCoreRequestAverage, sa1.CPUCoreRequestAverage)
+		}
+		if !util.IsApproximately(sa1.CPUCoreUsageAverage, (0.5*osa2.CPUCoreUsageAverage)+osa1.CPUCoreUsageAverage) {
+			t.Fatalf("incorrect CPUCoreUsageAverage: expected %.5f; actual %.5f", (0.5*osa2.CPUCoreUsageAverage)+osa1.CPUCoreUsageAverage, sa1.CPUCoreUsageAverage)
+		}
+		if !util.IsApproximately(sa1.RAMBytesRequestAverage, (0.5*osa2.RAMBytesRequestAverage)+osa1.RAMBytesRequestAverage) {
+			t.Fatalf("incorrect RAMBytesRequestAverage: expected %.5f; actual %.5f", (0.5*osa2.RAMBytesRequestAverage)+osa1.RAMBytesRequestAverage, sa1.RAMBytesRequestAverage)
+		}
+		if !util.IsApproximately(sa1.RAMBytesUsageAverage, (0.5*osa2.RAMBytesUsageAverage)+osa1.RAMBytesUsageAverage) {
+			t.Fatalf("incorrect RAMBytesUsageAverage: expected %.5f; actual %.5f", (0.5*osa2.RAMBytesUsageAverage)+osa1.RAMBytesUsageAverage, sa1.RAMBytesUsageAverage)
+		}
+		if !util.IsApproximately(sa1.CPUCost, osa2.CPUCost+osa1.CPUCost) {
+			t.Fatalf("incorrect CPUCost: expected %.5f; actual %.5f", osa2.CPUCost+osa1.CPUCost, sa1.CPUCost)
+		}
+		if !util.IsApproximately(sa1.GPUCost, osa2.GPUCost+osa1.GPUCost) {
+			t.Fatalf("incorrect GPUCost: expected %.5f; actual %.5f", osa2.GPUCost+osa1.GPUCost, sa1.GPUCost)
+		}
+		if !util.IsApproximately(sa1.NetworkCost, osa2.NetworkCost+osa1.NetworkCost) {
+			t.Fatalf("incorrect NetworkCost: expected %.5f; actual %.5f", osa2.NetworkCost+osa1.NetworkCost, sa1.NetworkCost)
+		}
+		if !util.IsApproximately(sa1.LoadBalancerCost, osa2.LoadBalancerCost+osa1.LoadBalancerCost) {
+			t.Fatalf("incorrect LoadBalancerCost: expected %.5f; actual %.5f", osa2.LoadBalancerCost+osa1.LoadBalancerCost, sa1.LoadBalancerCost)
+		}
+		if !util.IsApproximately(sa1.PVCost, osa2.PVCost+osa1.PVCost) {
+			t.Fatalf("incorrect PVCost: expected %.5f; actual %.5f", osa2.PVCost+osa1.PVCost, sa1.PVCost)
+		}
+		if !util.IsApproximately(sa1.RAMCost, osa2.RAMCost+osa1.RAMCost) {
+			t.Fatalf("incorrect RAMCost: expected %.5f; actual %.5f", osa2.RAMCost+osa1.RAMCost, sa1.RAMCost)
+		}
+		if !util.IsApproximately(sa1.SharedCost, osa2.SharedCost+osa1.SharedCost) {
+			t.Fatalf("incorrect SharedCost: expected %.5f; actual %.5f", osa2.SharedCost+osa1.SharedCost, sa1.SharedCost)
+		}
+		if !util.IsApproximately(sa1.ExternalCost, osa2.ExternalCost+osa1.ExternalCost) {
+			t.Fatalf("incorrect ExternalCost: expected %.5f; actual %.5f", osa2.ExternalCost+osa1.ExternalCost, sa1.ExternalCost)
+		}
+	})
+}

+ 524 - 0
pkg/kubecost/totals.go

@@ -0,0 +1,524 @@
+package kubecost
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/patrickmn/go-cache"
+)
+
+// AllocationTotals represents aggregate costs of all Allocations for
+// a given cluster or tuple of (cluster, node) between a given start and end
+// time, where the costs are aggregated per-resource. AllocationTotals
+// is designed to be used as a pre-computed intermediate data structure when
+// contextual knowledge is required to carry out a task, but computing totals
+// on-the-fly would be expensive; e.g. idle allocation; sharing coefficients
+// for idle or shared resources, etc.
+type AllocationTotals struct {
+	Start                          time.Time `json:"start"`
+	End                            time.Time `json:"end"`
+	Cluster                        string    `json:"cluster"`
+	Node                           string    `json:"node"`
+	Count                          int       `json:"count"`
+	CPUCost                        float64   `json:"cpuCost"`
+	CPUCostAdjustment              float64   `json:"cpuCostAdjustment"`
+	GPUCost                        float64   `json:"gpuCost"`
+	GPUCostAdjustment              float64   `json:"gpuCostAdjustment"`
+	LoadBalancerCost               float64   `json:"loadBalancerCost"`
+	LoadBalancerCostAdjustment     float64   `json:"loadBalancerCostAdjustment"`
+	NetworkCost                    float64   `json:"networkCost"`
+	NetworkCostAdjustment          float64   `json:"networkCostAdjustment"`
+	PersistentVolumeCost           float64   `json:"persistentVolumeCost"`
+	PersistentVolumeCostAdjustment float64   `json:"persistentVolumeCostAdjustment"`
+	RAMCost                        float64   `json:"ramCost"`
+	RAMCostAdjustment              float64   `json:"ramCostAdjustment"`
+}
+
+// ClearAdjustments sets all adjustment fields to 0.0
+func (art *AllocationTotals) ClearAdjustments() {
+	art.CPUCostAdjustment = 0.0
+	art.GPUCostAdjustment = 0.0
+	art.LoadBalancerCostAdjustment = 0.0
+	art.NetworkCostAdjustment = 0.0
+	art.PersistentVolumeCostAdjustment = 0.0
+	art.RAMCostAdjustment = 0.0
+}
+
+// TotalCPUCost returns CPU cost with adjustment.
+func (art *AllocationTotals) TotalCPUCost() float64 {
+	return art.CPUCost + art.CPUCostAdjustment
+}
+
+// TotalGPUCost returns GPU cost with adjustment.
+func (art *AllocationTotals) TotalGPUCost() float64 {
+	return art.GPUCost + art.GPUCostAdjustment
+}
+
+// TotalRAMCost returns RAM cost with adjustment.
+func (art *AllocationTotals) TotalRAMCost() float64 {
+	return art.RAMCost + art.RAMCostAdjustment
+}
+
+// TotalCost returns the sum of all costs.
+func (art *AllocationTotals) TotalCost() float64 {
+	return art.TotalCPUCost() + art.TotalGPUCost() + art.LoadBalancerCost +
+		art.NetworkCost + art.PersistentVolumeCost + art.TotalRAMCost()
+}
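Adjustments here are reconciliation deltas that ride alongside the original estimates, so a $0.10 CPU estimate reconciled down by $0.02 contributes $0.08. A quick sketch using only the accessors above:

	art := &AllocationTotals{CPUCost: 0.10, CPUCostAdjustment: -0.02, RAMCost: 0.05}
	fmt.Printf("%.2f\n", art.TotalCPUCost()) // 0.08
	fmt.Printf("%.2f\n", art.TotalCost())    // 0.13 (0.08 CPU + 0.05 RAM)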
+
+// ComputeAllocationTotals totals the resource costs of the given AllocationSet
+// using the given property, i.e. cluster or node, where "node" really means to
+// use the fully-qualified (cluster, node) tuple.
+func ComputeAllocationTotals(as *AllocationSet, prop string) map[string]*AllocationTotals {
+	arts := map[string]*AllocationTotals{}
+
+	as.Each(func(name string, alloc *Allocation) {
+		// Do not count idle or unmounted allocations
+		if alloc.IsIdle() || alloc.IsUnmounted() {
+			return
+		}
+
+		// Default to computing totals by Cluster, but allow override to use Node.
+		key := alloc.Properties.Cluster
+		if prop == AllocationNodeProp {
+			key = fmt.Sprintf("%s/%s", alloc.Properties.Cluster, alloc.Properties.Node)
+		}
+
+		if _, ok := arts[key]; !ok {
+			arts[key] = &AllocationTotals{
+				Start:   alloc.Start,
+				End:     alloc.End,
+				Cluster: alloc.Properties.Cluster,
+				Node:    alloc.Properties.Node,
+			}
+		}
+
+		if arts[key].Start.After(alloc.Start) {
+			arts[key].Start = alloc.Start
+		}
+		if arts[key].End.Before(alloc.End) {
+			arts[key].End = alloc.End
+		}
+
+		if arts[key].Node != alloc.Properties.Node {
+			arts[key].Node = ""
+		}
+
+		arts[key].Count++
+		arts[key].CPUCost += alloc.CPUCost
+		arts[key].CPUCostAdjustment += alloc.CPUCostAdjustment
+		arts[key].GPUCost += alloc.GPUCost
+		arts[key].GPUCostAdjustment += alloc.GPUCostAdjustment
+		arts[key].LoadBalancerCost += alloc.LBTotalCost()
+		arts[key].NetworkCost += alloc.NetworkTotalCost()
+		arts[key].PersistentVolumeCost += alloc.PVCost()
+		arts[key].RAMCost += alloc.RAMCost
+		arts[key].RAMCostAdjustment += alloc.RAMCostAdjustment
+	})
+
+	return arts
+}
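For example, totaling per node yields one AllocationTotals per "cluster/node" key. A sketch, assuming an AllocationSet `as` built elsewhere:

	byNode := ComputeAllocationTotals(as, AllocationNodeProp)
	for key, tot := range byNode {
		fmt.Printf("%s: %d allocations, $%.4f\n", key, tot.Count, tot.TotalCost())
	}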
+
+// AssetTotals represents aggregate costs of all Assets for a given
+// cluster or tuple of (cluster, node) between a given start and end time,
+// where the costs are aggregated per-resource. AssetTotals is designed
+// to be used as a pre-computed intermediate data structure when contextual
+// knowledge is required to carry out a task, but computing totals on-the-fly
+// would be expensive; e.g. idle allocation, shared tenancy costs
+type AssetTotals struct {
+	Start                 time.Time `json:"start"`
+	End                   time.Time `json:"end"`
+	Cluster               string    `json:"cluster"`
+	Node                  string    `json:"node"`
+	Count                 int       `json:"count"`
+	AttachedVolumeCost    float64   `json:"attachedVolumeCost"`
+	ClusterManagementCost float64   `json:"clusterManagementCost"`
+	CPUCost               float64   `json:"cpuCost"`
+	CPUCostAdjustment     float64   `json:"cpuCostAdjustment"`
+	GPUCost               float64   `json:"gpuCost"`
+	GPUCostAdjustment     float64   `json:"gpuCostAdjustment"`
+	PersistentVolumeCost  float64   `json:"persistentVolumeCost"`
+	RAMCost               float64   `json:"ramCost"`
+	RAMCostAdjustment     float64   `json:"ramCostAdjustment"`
+}
+
+// ClearAdjustments sets all adjustment fields to 0.0
+func (art *AssetTotals) ClearAdjustments() {
+	art.CPUCostAdjustment = 0.0
+	art.GPUCostAdjustment = 0.0
+	art.RAMCostAdjustment = 0.0
+}
+
+// TotalCPUCost returns CPU cost with adjustment.
+func (art *AssetTotals) TotalCPUCost() float64 {
+	return art.CPUCost + art.CPUCostAdjustment
+}
+
+// TotalGPUCost returns GPU cost with adjustment.
+func (art *AssetTotals) TotalGPUCost() float64 {
+	return art.GPUCost + art.GPUCostAdjustment
+}
+
+// TotalRAMCost returns RAM cost with adjustment.
+func (art *AssetTotals) TotalRAMCost() float64 {
+	return art.RAMCost + art.RAMCostAdjustment
+}
+
+// TotalCost returns the sum of all costs
+func (art *AssetTotals) TotalCost() float64 {
+	return art.AttachedVolumeCost + art.ClusterManagementCost + art.TotalCPUCost() +
+		art.TotalGPUCost() + art.PersistentVolumeCost + art.TotalRAMCost()
+}
+
+// ComputeAssetTotals totals the resource costs of the given AssetSet,
+// using the given property, i.e. cluster or node, where "node" really means to
+// use the fully-qualified (cluster, node) tuple.
+// NOTE: we're not capturing LoadBalancers here yet, but only because we don't
+// yet need them. They could be added.
+func ComputeAssetTotals(as *AssetSet, prop AssetProperty) map[string]*AssetTotals {
+	arts := map[string]*AssetTotals{}
+
+	// Attached disks are tracked by matching their name with the name of the
+	// node, as is standard for attached disks.
+	nodeNames := map[string]bool{}
+	disks := map[string]*Disk{}
+
+	as.Each(func(name string, asset Asset) {
+		if node, ok := asset.(*Node); ok {
+			// Default to computing totals by Cluster, but allow override to use Node.
+			key := node.Properties().Cluster
+			if prop == AssetNodeProp {
+				key = fmt.Sprintf("%s/%s", node.Properties().Cluster, node.Properties().Name)
+			}
+
+			// Record the fully-qualified cluster/node name so attached
+			// volumes can be matched against it later, regardless of the
+			// aggregation property.
+			nodeNames[fmt.Sprintf("%s/%s", node.Properties().Cluster, node.Properties().Name)] = true
+
+			// adjustmentRate is used to scale resource costs proportionally
+			// by the adjustment. This is necessary because we only get one
+			// adjustment per Node, not one per-resource-per-Node.
+			//
+			// e.g. total cost = $90, adjustment = -$10 => 0.9
+			// e.g. total cost = $150, adjustment = -$300 => 0.3333
+			// e.g. total cost = $150, adjustment = $50 => 1.5
+			adjustmentRate := 1.0
+			if node.TotalCost()-node.Adjustment() == 0 {
+				// If (totalCost - adjustment) is 0.0 then adjustment cancels
+				// the entire node cost and we should make everything 0
+				// without dividing by 0.
+				adjustmentRate = 0.0
+				log.DedupedWarningf(5, "ComputeTotals: node cost adjusted to $0.00 for %s", node.Properties().Name)
+			} else if node.Adjustment() != 0.0 {
+				// adjustmentRate is the ratio of cost-with-adjustment (i.e. TotalCost)
+				// to cost-without-adjustment (i.e. TotalCost - Adjustment).
+				adjustmentRate = node.TotalCost() / (node.TotalCost() - node.Adjustment())
+			}
+
+			totalCPUCost := node.CPUCost * (1.0 - node.Discount)
+			cpuCost := totalCPUCost * adjustmentRate
+			cpuCostAdjustment := totalCPUCost - cpuCost
+
+			totalGPUCost := node.GPUCost * (1.0 - node.Discount)
+			gpuCost := totalGPUCost * adjustmentRate
+			gpuCostAdjustment := totalGPUCost - gpuCost
+
+			totalRAMCost := node.RAMCost * (1.0 - node.Discount)
+			ramCost := totalRAMCost * adjustmentRate
+			ramCostAdjustment := totalRAMCost - ramCost
+
+			if _, ok := arts[key]; !ok {
+				arts[key] = &AssetTotals{
+					Start:   node.Start(),
+					End:     node.End(),
+					Cluster: node.Properties().Cluster,
+					Node:    node.Properties().Name,
+				}
+			}
+
+			if arts[key].Start.After(node.Start()) {
+				arts[key].Start = node.Start()
+			}
+			if arts[key].End.Before(node.End()) {
+				arts[key].End = node.End()
+			}
+
+			if arts[key].Node != node.Properties().Name {
+				arts[key].Node = ""
+			}
+
+			arts[key].Count++
+			arts[key].CPUCost += cpuCost
+			arts[key].CPUCostAdjustment += cpuCostAdjustment
+			arts[key].RAMCost += ramCost
+			arts[key].RAMCostAdjustment += ramCostAdjustment
+			arts[key].GPUCost += gpuCost
+			arts[key].GPUCostAdjustment += gpuCostAdjustment
+		} else if disk, ok := asset.(*Disk); ok {
+			key := fmt.Sprintf("%s/%s", disk.Properties().Cluster, disk.Properties().Name)
+			disks[key] = disk
+		} else if cm, ok := asset.(*ClusterManagement); ok && prop == AssetClusterProp {
+			// Only record cluster management when prop is Cluster because we
+			// can't break down ClusterManagement by node.
+			key := cm.Properties().Cluster
+
+			if _, ok := arts[key]; !ok {
+				arts[key] = &AssetTotals{
+					Start:   cm.Start(),
+					End:     cm.End(),
+					Cluster: cm.Properties().Cluster,
+				}
+			}
+
+			arts[key].Count++
+			arts[key].ClusterManagementCost += cm.TotalCost()
+		}
+	})
+
+	// Identify attached volumes as disks with names matching a node's name
+	for name := range nodeNames {
+		if disk, ok := disks[name]; ok {
+			// By default, the key will be the name, which is the tuple of
+			// cluster/node. But if we're aggregating by cluster only, then
+			// reset the key to just the cluster.
+			key := name
+			if prop == AssetClusterProp {
+				key = disk.Properties().Cluster
+			}
+
+			if _, ok := arts[key]; !ok {
+				arts[key] = &AssetTotals{
+					Start:   disk.Start(),
+					End:     disk.End(),
+					Cluster: disk.Properties().Cluster,
+				}
+
+				if prop == AssetNodeProp {
+					arts[key].Node = disk.Properties().Name
+				}
+			}
+
+			arts[key].Count++
+			arts[key].AttachedVolumeCost += disk.TotalCost()
+		}
+	}
+
+	return arts
+}
+
+// ComputeIdleCoefficients returns the idle coefficients for CPU, GPU, and RAM
+// (in that order) for the given resource costs and totals.
+func ComputeIdleCoefficients(shareSplit, key string, cpuCost, gpuCost, ramCost float64, allocationTotals map[string]*AllocationTotals) (float64, float64, float64) {
+	if shareSplit == ShareNone {
+		return 0.0, 0.0, 0.0
+	}
+
+	if shareSplit != ShareEven {
+		shareSplit = ShareWeighted
+	}
+
+	var cpuCoeff, gpuCoeff, ramCoeff float64
+
+	if _, ok := allocationTotals[key]; !ok {
+		return 0.0, 0.0, 0.0
+	}
+
+	if shareSplit == ShareEven {
+		coeff := 1.0 / float64(allocationTotals[key].Count)
+		return coeff, coeff, coeff
+	}
+
+	if allocationTotals[key].CPUCost > 0 {
+		cpuCoeff = cpuCost / allocationTotals[key].CPUCost
+	}
+
+	if allocationTotals[key].GPUCost > 0 {
+		gpuCoeff = gpuCost / allocationTotals[key].GPUCost
+	}
+
+	if allocationTotals[key].RAMCost > 0 {
+		ramCoeff = ramCost / allocationTotals[key].RAMCost
+	}
+
+	return cpuCoeff, gpuCoeff, ramCoeff
+}
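As a worked example: under ShareWeighted each coefficient is the allocation's share of its key's resource cost, while ShareEven gives every allocation the same 1/Count share. Sketch:

	totals := map[string]*AllocationTotals{
		"cluster1": {Count: 4, CPUCost: 10.0, RAMCost: 4.0},
	}
	cpu, gpu, ram := ComputeIdleCoefficients(ShareWeighted, "cluster1", 2.0, 0.0, 1.0, totals)
	// cpu = 0.2 (2/10), gpu = 0.0, ram = 0.25 (1/4)
	// ShareEven would instead return 0.25, 0.25, 0.25 (1/Count)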
+
+// TotalsStore acts as both an AllocationTotalsStore and an
+// AssetTotalsStore.
+type TotalsStore interface {
+	AllocationTotalsStore
+	AssetTotalsStore
+}
+
+// AllocationTotalsStore allows for storing (i.e. setting and
+// getting) AllocationTotals by cluster and by node.
+type AllocationTotalsStore interface {
+	GetAllocationTotalsByCluster(start, end time.Time) (map[string]*AllocationTotals, bool)
+	GetAllocationTotalsByNode(start, end time.Time) (map[string]*AllocationTotals, bool)
+	SetAllocationTotalsByCluster(start, end time.Time, rts map[string]*AllocationTotals)
+	SetAllocationTotalsByNode(start, end time.Time, rts map[string]*AllocationTotals)
+}
+
+// UpdateAllocationTotalsStore updates an AllocationTotalsStore
+// by totaling the given AllocationSet and saving the totals.
+func UpdateAllocationTotalsStore(arts AllocationTotalsStore, as *AllocationSet) error {
+	if arts == nil {
+		return errors.New("cannot update nil AllocationTotalsStore")
+	}
+
+	if as == nil {
+		return errors.New("cannot update AllocationTotalsStore from nil AllocationSet")
+	}
+
+	if as.Window.IsOpen() {
+		return errors.New("cannot update AllocationTotalsStore from AllocationSet with open window")
+	}
+
+	start := *as.Window.Start()
+	end := *as.Window.End()
+
+	artsByCluster := ComputeAllocationTotals(as, AllocationClusterProp)
+	arts.SetAllocationTotalsByCluster(start, end, artsByCluster)
+
+	artsByNode := ComputeAllocationTotals(as, AllocationNodeProp)
+	arts.SetAllocationTotalsByNode(start, end, artsByNode)
+
+	log.Infof("ETL: Allocation: updated resource totals for %s", as.Window)
+
+	return nil
+}
+
+// AssetTotalsStore allows for storing (i.e. setting and getting)
+// AssetTotals by cluster and by node.
+type AssetTotalsStore interface {
+	GetAssetTotalsByCluster(start, end time.Time) (map[string]*AssetTotals, bool)
+	GetAssetTotalsByNode(start, end time.Time) (map[string]*AssetTotals, bool)
+	SetAssetTotalsByCluster(start, end time.Time, rts map[string]*AssetTotals)
+	SetAssetTotalsByNode(start, end time.Time, rts map[string]*AssetTotals)
+}
+
+// UpdateAssetTotalsStore updates an AssetTotalsStore
+// by totaling the given AssetSet and saving the totals.
+func UpdateAssetTotalsStore(arts AssetTotalsStore, as *AssetSet) error {
+	if arts == nil {
+		return errors.New("cannot update nil AssetTotalsStore")
+	}
+
+	if as == nil {
+		return errors.New("cannot update AssetTotalsStore from nil AssetSet")
+	}
+
+	if as.Window.IsOpen() {
+		return errors.New("cannot update AssetTotalsStore from AssetSet with open window")
+	}
+
+	start := *as.Window.Start()
+	end := *as.Window.End()
+
+	artsByCluster := ComputeAssetTotals(as, AssetClusterProp)
+	arts.SetAssetTotalsByCluster(start, end, artsByCluster)
+
+	artsByNode := ComputeAssetTotals(as, AssetNodeProp)
+	arts.SetAssetTotalsByNode(start, end, artsByNode)
+
+	log.Infof("ETL: Asset: updated resource totals for %s", as.Window)
+
+	return nil
+}
+
+// MemoryTotalsStore is an in-memory cache TotalsStore
+type MemoryTotalsStore struct {
+	allocTotalsByCluster *cache.Cache
+	allocTotalsByNode    *cache.Cache
+	assetTotalsByCluster *cache.Cache
+	assetTotalsByNode    *cache.Cache
+}
+
+// NewMemoryTotalsStore instantiates a new MemoryTotalsStore,
+// which is composed of four in-memory caches.
+func NewMemoryTotalsStore() *MemoryTotalsStore {
+	return &MemoryTotalsStore{
+		allocTotalsByCluster: cache.New(cache.NoExpiration, cache.NoExpiration),
+		allocTotalsByNode:    cache.New(cache.NoExpiration, cache.NoExpiration),
+		assetTotalsByCluster: cache.New(cache.NoExpiration, cache.NoExpiration),
+		assetTotalsByNode:    cache.New(cache.NoExpiration, cache.NoExpiration),
+	}
+}
+
+// GetAllocationTotalsByCluster retrieves the AllocationTotals
+// by cluster for the given start and end times.
+func (mts *MemoryTotalsStore) GetAllocationTotalsByCluster(start time.Time, end time.Time) (map[string]*AllocationTotals, bool) {
+	k := storeKey(start, end)
+	if raw, ok := mts.allocTotalsByCluster.Get(k); ok {
+		return raw.(map[string]*AllocationTotals), true
+	} else {
+		return map[string]*AllocationTotals{}, false
+	}
+}
+
+// GetAllocationTotalsByNode retrieves the AllocationTotals
+// by node for the given start and end times.
+func (mts *MemoryTotalsStore) GetAllocationTotalsByNode(start time.Time, end time.Time) (map[string]*AllocationTotals, bool) {
+	k := storeKey(start, end)
+	if raw, ok := mts.allocTotalsByNode.Get(k); ok {
+		return raw.(map[string]*AllocationTotals), true
+	} else {
+		return map[string]*AllocationTotals{}, false
+	}
+}
+
+// SetAllocationTotalsByCluster sets the per-cluster AllocationTotals
+// to the given values for the given start and end times.
+func (mts *MemoryTotalsStore) SetAllocationTotalsByCluster(start time.Time, end time.Time, arts map[string]*AllocationTotals) {
+	k := storeKey(start, end)
+	mts.allocTotalsByCluster.Set(k, arts, cache.NoExpiration)
+}
+
+// SetAllocationTotalsByNode sets the per-node AllocationTotals
+// to the given values for the given start and end times.
+func (mts *MemoryTotalsStore) SetAllocationTotalsByNode(start time.Time, end time.Time, arts map[string]*AllocationTotals) {
+	k := storeKey(start, end)
+	mts.allocTotalsByNode.Set(k, arts, cache.NoExpiration)
+}
+
+// GetAssetTotalsByCluster retrieves the AssetTotals
+// by cluster for the given start and end times.
+func (mts *MemoryTotalsStore) GetAssetTotalsByCluster(start time.Time, end time.Time) (map[string]*AssetTotals, bool) {
+	k := storeKey(start, end)
+	if raw, ok := mts.assetTotalsByCluster.Get(k); ok {
+		return raw.(map[string]*AssetTotals), true
+	} else {
+		return map[string]*AssetTotals{}, false
+	}
+}
+
+// GetAssetTotalsByNode retrieves the AssetTotals
+// by node for the given start and end times.
+func (mts *MemoryTotalsStore) GetAssetTotalsByNode(start time.Time, end time.Time) (map[string]*AssetTotals, bool) {
+	k := storeKey(start, end)
+	if raw, ok := mts.assetTotalsByNode.Get(k); ok {
+		return raw.(map[string]*AssetTotals), true
+	} else {
+		return map[string]*AssetTotals{}, false
+	}
+}
+
+// SetAssetTotalsByCluster sets the per-cluster AssetTotals
+// to the given values for the given start and end times.
+func (mts *MemoryTotalsStore) SetAssetTotalsByCluster(start time.Time, end time.Time, arts map[string]*AssetTotals) {
+	k := storeKey(start, end)
+	mts.assetTotalsByCluster.Set(k, arts, cache.NoExpiration)
+}
+
+// SetAssetTotalsByNode sets the per-node AssetTotals
+// to the given values for the given start and end times.
+func (mts *MemoryTotalsStore) SetAssetTotalsByNode(start time.Time, end time.Time, arts map[string]*AssetTotals) {
+	k := storeKey(start, end)
+	mts.assetTotalsByNode.Set(k, arts, cache.NoExpiration)
+}
+
+// storeKey creates a storage key based on start and end times
+func storeKey(start, end time.Time) string {
+	startStr := strconv.FormatInt(start.Unix(), 10)
+	endStr := strconv.FormatInt(end.Unix(), 10)
+	return fmt.Sprintf("%s-%s", startStr, endStr)
+}
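End to end, a MemoryTotalsStore plugs directly into the update helper above. A sketch, assuming a closed-window AllocationSet `as`:

	store := NewMemoryTotalsStore()
	if err := UpdateAllocationTotalsStore(store, as); err != nil {
		// open windows and nil sets are rejected
	}
	if byCluster, ok := store.GetAllocationTotalsByCluster(*as.Window.Start(), *as.Window.End()); ok {
		for cluster, tot := range byCluster {
			fmt.Printf("%s: $%.4f\n", cluster, tot.TotalCost())
		}
	}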

+ 38 - 3
pkg/kubecost/window.go

@@ -3,12 +3,13 @@ package kubecost
 import (
 	"bytes"
 	"fmt"
-	"github.com/kubecost/cost-model/pkg/util/timeutil"
 	"math"
 	"regexp"
 	"strconv"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/util/timeutil"
+
 	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/thanos"
 )
@@ -410,6 +411,32 @@ func (w Window) Expand(that Window) Window {
 	return w
 }
 
+// ContractStart moves the Window's start forward to the given time, if it is
+// later than the current start, shrinking the Window.
+func (w Window) ContractStart(start time.Time) Window {
+	if w.start == nil || start.After(*w.start) {
+		w.start = &start
+	}
+	return w
+}
+
+// ContractEnd moves the Window's end backward to the given time, if it is
+// earlier than the current end, shrinking the Window.
+func (w Window) ContractEnd(end time.Time) Window {
+	if w.end == nil || end.Before(*w.end) {
+		w.end = &end
+	}
+	return w
+}
+
+// Contract shrinks the Window to the intersection of itself and the given
+// Window; it is the opposite of Expand.
+func (w Window) Contract(that Window) Window {
+	if that.start != nil {
+		w = w.ContractStart(*that.start)
+	}
+
+	if that.end != nil {
+		w = w.ContractEnd(*that.end)
+	}
+
+	return w
+}
+
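Contract being the dual of Expand, contracting [00:00, 12:00) by [06:00, 18:00) yields [06:00, 12:00). A sketch, assuming the package's NewWindow constructor and *time.Time values t0, t6, t12, t18:

	a := NewWindow(&t0, &t12) // [00:00, 12:00)
	b := NewWindow(&t6, &t18) // [06:00, 18:00)
	c := a.Contract(b)        // [06:00, 12:00)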
 func (w Window) Hours() float64 {
 	if w.IsOpen() {
 		return math.Inf(1)
@@ -433,8 +460,16 @@ func (w Window) IsOpen() bool {
 // TODO:CLEANUP make this unmarshalable (make Start and End public)
 func (w Window) MarshalJSON() ([]byte, error) {
 	buffer := bytes.NewBufferString("{")
-	buffer.WriteString(fmt.Sprintf("\"start\":\"%s\",", w.start.Format(time.RFC3339)))
-	buffer.WriteString(fmt.Sprintf("\"end\":\"%s\"", w.end.Format(time.RFC3339)))
+	if w.start != nil {
+		buffer.WriteString(fmt.Sprintf("\"start\":\"%s\",", w.start.Format(time.RFC3339)))
+	} else {
+		buffer.WriteString("\"start\":null,")
+	}
+	if w.end != nil {
+		buffer.WriteString(fmt.Sprintf("\"end\":\"%s\"", w.end.Format(time.RFC3339)))
+	} else {
+		buffer.WriteString("\"end\":null")
+	}
 	buffer.WriteString("}")
 	return buffer.Bytes(), nil
 }
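With the nil guards above, open-ended windows marshal instead of panicking on a nil dereference, and the unquoted-null fix keeps the output valid JSON. A sketch, assuming this package's NewWindow constructor:

	w := NewWindow(&start, nil)
	b, _ := json.Marshal(w)
	// {"start":"2021-01-01T00:00:00Z","end":null}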

+ 15 - 0
pkg/prom/query.go

@@ -8,6 +8,7 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/errors"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/httputil"
@@ -21,6 +22,10 @@ const (
 	epQueryRange = apiPrefix + "/query_range"
 )
 
+// promQueryOffset is the offset applied to each non-range (instant) query.
+// Package-scoped so the duration is parsed from the environment only once.
+var promQueryOffset time.Duration = env.GetPrometheusQueryOffset()
+
 // Context wraps a Prometheus client and provides methods for querying and
 // parsing query responses and errors.
 type Context struct {
@@ -171,6 +176,16 @@ func (ctx *Context) RawQuery(query string) ([]byte, error) {
 	u := ctx.Client.URL(epQuery, nil)
 	q := u.Query()
 	q.Set("query", query)
+
+	// For non-range queries, set the query timestamp to now minus the offset.
+	// This is typically only needed when the primary prom db has delayed
+	// insertion (thanos, cortex, etc.).
+	if promQueryOffset != 0 && ctx.name != AllocationContextName {
+		q.Set("time", time.Now().Add(-promQueryOffset).UTC().Format(time.RFC3339))
+	} else {
+		q.Set("time", time.Now().UTC().Format(time.RFC3339))
+	}
+
 	u.RawQuery = q.Encode()
 
 	req, err := http.NewRequest(http.MethodPost, u.String(), nil)
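For instance, with a 5m offset an instant query issued at 12:00:00Z is evaluated at 11:55:00Z, giving a delayed-insertion store time to catch up. A sketch of the arithmetic (the env var name is an assumption):

	// e.g. PROMETHEUS_QUERY_OFFSET=5m (name assumed)
	offset := 5 * time.Minute
	ts := time.Now().Add(-offset).UTC().Format(time.RFC3339)
	// ts is sent as the "time" parameter on the instant query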

+ 58 - 0
pkg/storage/bucketstorage.go

@@ -0,0 +1,58 @@
+package storage
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/pkg/errors"
+	"gopkg.in/yaml.v2"
+)
+
+// StorageProvider is the type of provider used for storage if not leveraging a file implementation.
+type StorageProvider string
+
+const (
+	S3 StorageProvider = "S3"
+	// AZURE StorageProvider = "AZURE"
+	// GCS   StorageProvider = "GCS"
+)
+
+// StorageConfig is the configuration type used as the "parent" configuration. It contains a type, which will
+// specify the bucket storage implementation, and a configuration object specific to that storage implementation.
+type StorageConfig struct {
+	Type   StorageProvider `yaml:"type"`
+	Config interface{}     `yaml:"config"`
+}
+
+// NewBucketStorage initializes and returns a new Storage implementation leveraging the storage provider
+// configuration. This configuration type uses the layout provided in thanos: https://thanos.io/tip/thanos/storage.md/
+func NewBucketStorage(config []byte) (Storage, error) {
+	storageConfig := &StorageConfig{}
+	if err := yaml.UnmarshalStrict(config, storageConfig); err != nil {
+		return nil, errors.Wrap(err, "parsing config YAML file")
+	}
+
+	// Because the Config property is specific to the storage implementation, we'll marshal back into yaml, and allow
+	// the specific implementation to unmarshal back into a concrete configuration type.
+	config, err := yaml.Marshal(storageConfig.Config)
+	if err != nil {
+		return nil, errors.Wrap(err, "marshal content of storage configuration")
+	}
+
+	var storage Storage
+	switch strings.ToUpper(string(storageConfig.Type)) {
+	case string(S3):
+		storage, err = NewS3Storage(config)
+	//case string(GCS):
+	//	storage, err = NewGCSStorage(config)
+	//case string(AZURE):
+	//	storage, err = NewAzureStorage(config)
+	default:
+		return nil, errors.Errorf("storage with type %s is not supported", storageConfig.Type)
+	}
+	if err != nil {
+		return nil, errors.Wrap(err, fmt.Sprintf("create %s client", storageConfig.Type))
+	}
+
+	return storage, nil
+}
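A config following the Thanos storage layout can then be handed straight to NewBucketStorage. A sketch with placeholder values:

	conf := []byte(`
type: S3
config:
  bucket: my-kubecost-bucket
  endpoint: s3.us-east-1.amazonaws.com
  region: us-east-1
`)
	store, err := NewBucketStorage(conf)
	if err != nil {
		// malformed YAML and unsupported types both land here
	}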

+ 139 - 0
pkg/storage/filestorage.go

@@ -0,0 +1,139 @@
+package storage
+
+import (
+	gofs "io/fs"
+	"io/ioutil"
+	"os"
+	gopath "path"
+	"path/filepath"
+
+	"github.com/kubecost/cost-model/pkg/util/fileutil"
+	"github.com/pkg/errors"
+)
+
+// FileStorage leverages the file system to write data to disk.
+type FileStorage struct {
+	baseDir string
+}
+
+// NewFileStorage returns a new storage API which leverages the file system.
+func NewFileStorage(baseDir string) Storage {
+	return &FileStorage{baseDir}
+}
+
+// FullPath returns the storage working path combined with the path provided
+func (fs *FileStorage) FullPath(path string) string {
+	return gopath.Join(fs.baseDir, path)
+}
+
+// Stat returns the StorageInfo for the specific path.
+func (fs *FileStorage) Stat(path string) (*StorageInfo, error) {
+	f := gopath.Join(fs.baseDir, path)
+	st, err := os.Stat(f)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, DoesNotExistError
+		}
+
+		return nil, errors.Wrap(err, "Failed to stat file")
+	}
+
+	return FileToStorageInfo(st), nil
+}
+
+// List uses the relative path of the storage combined with the provided path to return
+// storage information for the files.
+func (fs *FileStorage) List(path string) ([]*StorageInfo, error) {
+	p := gopath.Join(fs.baseDir, path)
+
+	// Read the files in the target directory
+	files, err := ioutil.ReadDir(p)
+	if err != nil {
+		return nil, err
+	}
+
+	return FilesToStorageInfo(files), nil
+}
+
+// Read uses the relative path of the storage combined with the provided path to
+// read the contents.
+func (fs *FileStorage) Read(path string) ([]byte, error) {
+	f := gopath.Join(fs.baseDir, path)
+
+	b, err := ioutil.ReadFile(f)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, DoesNotExistError
+		}
+		return nil, errors.Wrap(err, "Failed to read file")
+	}
+
+	return b, nil
+}
+
+// Write uses the relative path of the storage combined with the provided path
+// to write a new file or overwrite an existing file.
+func (fs *FileStorage) Write(path string, data []byte) error {
+	f := fs.prepare(path)
+
+	err := ioutil.WriteFile(f, data, os.ModePerm)
+	if err != nil {
+		return errors.Wrap(err, "Failed to write file")
+	}
+
+	return nil
+}
+
+// Remove uses the relative path of the storage combined with the provided path to
+// remove a file from storage permanently.
+func (fs *FileStorage) Remove(path string) error {
+	f := gopath.Join(fs.baseDir, path)
+
+	err := os.Remove(f)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return DoesNotExistError
+		}
+
+		return errors.Wrap(err, "Failed to remove file")
+	}
+
+	return nil
+}
+
+// Exists uses the relative path of the storage combined with the provided path to
+// determine if the file exists.
+func (fs *FileStorage) Exists(path string) (bool, error) {
+	f := gopath.Join(fs.baseDir, path)
+	return fileutil.FileExists(f)
+}
+
+// prepare checks to see if the directory being written to should be created before writing
+// the file, and then returns the correct full path.
+func (fs *FileStorage) prepare(path string) string {
+	f := gopath.Join(fs.baseDir, path)
+	dir := filepath.Dir(f)
+	if _, e := os.Stat(dir); e != nil && os.IsNotExist(e) {
+		os.MkdirAll(dir, os.ModePerm)
+	}
+
+	return f
+}
+
+// FilesToStorageInfo maps a []fs.FileInfo to []*storage.StorageInfo
+func FilesToStorageInfo(fileInfo []gofs.FileInfo) []*StorageInfo {
+	var stats []*StorageInfo
+	for _, info := range fileInfo {
+		stats = append(stats, FileToStorageInfo(info))
+	}
+	return stats
+}
+
+// FileToStorageInfo maps a fs.FileInfo to *storage.StorageInfo
+func FileToStorageInfo(fileInfo gofs.FileInfo) *StorageInfo {
+	return &StorageInfo{
+		Name:    fileInfo.Name(),
+		Size:    fileInfo.Size(),
+		ModTime: fileInfo.ModTime(),
+	}
+}
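A quick round-trip through the file implementation (sketch; the base directory is a placeholder):

	fs := NewFileStorage("/var/configs")
	if err := fs.Write("cluster-cache.json", []byte("{}")); err != nil {
		// parent directories are created on demand by prepare()
	}
	data, err := fs.Read("cluster-cache.json")
	info, _ := fs.Stat("cluster-cache.json") // info.Name, info.Size, info.ModTime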

+ 543 - 0
pkg/storage/s3storage.go

@@ -0,0 +1,543 @@
+// Fork from Thanos S3 Bucket support to reuse configuration options
+// Licensed under the Apache License 2.0
+// https://github.com/thanos-io/thanos/blob/main/pkg/objstore/s3/s3.go
+package storage
+
+import (
+	"bytes"
+	"context"
+	"crypto/tls"
+	"io/ioutil"
+	"net"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/log"
+
+	"github.com/minio/minio-go/v7"
+	"github.com/minio/minio-go/v7/pkg/credentials"
+	"github.com/minio/minio-go/v7/pkg/encrypt"
+	"github.com/pkg/errors"
+
+	"gopkg.in/yaml.v2"
+)
+
+type ctxKey int
+
+const (
+	// DirDelim is the delimiter used to model a directory structure in an object store bucket.
+	DirDelim = "/"
+
+	// SSEKMS is the name of the SSE-KMS method for objectstore encryption.
+	SSEKMS = "SSE-KMS"
+
+	// SSEC is the name of the SSE-C method for objstore encryption.
+	SSEC = "SSE-C"
+
+	// SSES3 is the name of the SSE-S3 method for objstore encryption.
+	SSES3 = "SSE-S3"
+
+	// sseConfigKey is the context key to override SSE config. This feature is used by downstream
+	// projects (eg. Cortex) to inject custom SSE config on a per-request basis. Future work or
+	// refactoring can introduce breaking changes as far as the functionality is preserved.
+	// NOTE: we're using a context value only because it's a very specific S3 option. If SSE will
+	// be available to wider set of backends we should probably add a variadic option to Get() and Upload().
+	sseConfigKey = ctxKey(0)
+)
+
+var DefaultConfig = S3Config{
+	PutUserMetadata: map[string]string{},
+	HTTPConfig: HTTPConfig{
+		IdleConnTimeout:       time.Duration(90 * time.Second),
+		ResponseHeaderTimeout: time.Duration(2 * time.Minute),
+		TLSHandshakeTimeout:   time.Duration(10 * time.Second),
+		ExpectContinueTimeout: time.Duration(1 * time.Second),
+		MaxIdleConns:          100,
+		MaxIdleConnsPerHost:   100,
+		MaxConnsPerHost:       0,
+	},
+	PartSize: 1024 * 1024 * 64, // 64MB.
+}
+
+// S3Config stores the configuration for the s3 bucket.
+type S3Config struct {
+	Bucket             string            `yaml:"bucket"`
+	Endpoint           string            `yaml:"endpoint"`
+	Region             string            `yaml:"region"`
+	AccessKey          string            `yaml:"access_key"`
+	Insecure           bool              `yaml:"insecure"`
+	SignatureV2        bool              `yaml:"signature_version2"`
+	SecretKey          string            `yaml:"secret_key"`
+	PutUserMetadata    map[string]string `yaml:"put_user_metadata"`
+	HTTPConfig         HTTPConfig        `yaml:"http_config"`
+	TraceConfig        TraceConfig       `yaml:"trace"`
+	ListObjectsVersion string            `yaml:"list_objects_version"`
+	// PartSize used for multipart upload. Only used if uploaded object size is known and larger than configured PartSize.
+	// NOTE we need to make sure this number does not produce more parts than 10 000.
+	PartSize  uint64    `yaml:"part_size"`
+	SSEConfig SSEConfig `yaml:"sse_config"`
+}
+
+// SSEConfig deals with the configuration of SSE for Minio. The following options are valid:
+// kmsencryptioncontext == https://docs.aws.amazon.com/kms/latest/developerguide/services-s3.html#s3-encryption-context
+type SSEConfig struct {
+	Type                 string            `yaml:"type"`
+	KMSKeyID             string            `yaml:"kms_key_id"`
+	KMSEncryptionContext map[string]string `yaml:"kms_encryption_context"`
+	EncryptionKey        string            `yaml:"encryption_key"`
+}
+
+type TraceConfig struct {
+	Enable bool `yaml:"enable"`
+}
+
+// HTTPConfig stores the http.Transport configuration for the s3 minio client.
+type HTTPConfig struct {
+	IdleConnTimeout       time.Duration `yaml:"idle_conn_timeout"`
+	ResponseHeaderTimeout time.Duration `yaml:"response_header_timeout"`
+	InsecureSkipVerify    bool          `yaml:"insecure_skip_verify"`
+
+	TLSHandshakeTimeout   time.Duration `yaml:"tls_handshake_timeout"`
+	ExpectContinueTimeout time.Duration `yaml:"expect_continue_timeout"`
+	MaxIdleConns          int           `yaml:"max_idle_conns"`
+	MaxIdleConnsPerHost   int           `yaml:"max_idle_conns_per_host"`
+	MaxConnsPerHost       int           `yaml:"max_conns_per_host"`
+
+	// Allow upstream callers to inject a round tripper
+	Transport http.RoundTripper `yaml:"-"`
+}
+
+// DefaultTransport - this default transport is based on the Minio
+// DefaultTransport up until the following commit:
+// https://github.com/minio/minio-go/commit/008c7aa71fc17e11bf980c209a4f8c4d687fc884
+// The values have since diverged.
+func DefaultTransport(config S3Config) *http.Transport {
+	return &http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   30 * time.Second,
+			KeepAlive: 30 * time.Second,
+			DualStack: true,
+		}).DialContext,
+
+		MaxIdleConns:          config.HTTPConfig.MaxIdleConns,
+		MaxIdleConnsPerHost:   config.HTTPConfig.MaxIdleConnsPerHost,
+		IdleConnTimeout:       time.Duration(config.HTTPConfig.IdleConnTimeout),
+		MaxConnsPerHost:       config.HTTPConfig.MaxConnsPerHost,
+		TLSHandshakeTimeout:   time.Duration(config.HTTPConfig.TLSHandshakeTimeout),
+		ExpectContinueTimeout: time.Duration(config.HTTPConfig.ExpectContinueTimeout),
+		// A custom ResponseHeaderTimeout was introduced
+		// to cover cases where the tcp connection works but
+		// the server never answers. Defaults to 2 minutes.
+		ResponseHeaderTimeout: time.Duration(config.HTTPConfig.ResponseHeaderTimeout),
+		// Set this value so that the underlying transport round-tripper
+		// doesn't try to auto decode the body of objects with
+		// content-encoding set to `gzip`.
+		//
+		// Refer: https://golang.org/src/net/http/transport.go?h=roundTrip#L1843.
+		DisableCompression: true,
+		// #nosec It's up to the user to decide on TLS configs
+		TLSClientConfig: &tls.Config{InsecureSkipVerify: config.HTTPConfig.InsecureSkipVerify},
+	}
+}
+
+// S3Storage provides storage via S3
+type S3Storage struct {
+	name            string
+	client          *minio.Client
+	defaultSSE      encrypt.ServerSide
+	putUserMetadata map[string]string
+	partSize        uint64
+	listObjectsV1   bool
+}
+
+// parseConfig unmarshals a buffer into an S3Config with default HTTPConfig values.
+func parseConfig(conf []byte) (S3Config, error) {
+	config := DefaultConfig
+	if err := yaml.UnmarshalStrict(conf, &config); err != nil {
+		return S3Config{}, err
+	}
+
+	return config, nil
+}
+
+// NewS3Storage returns a new S3Storage using the provided s3 config bytes.
+func NewS3Storage(conf []byte) (*S3Storage, error) {
+	log.Infof("Creating new S3 Storage...")
+
+	config, err := parseConfig(conf)
+	if err != nil {
+		return nil, err
+	}
+
+	return NewS3StorageWith(config)
+}
+
+// NewS3StorageWith returns a new S3Storage using the provided s3 config values.
+func NewS3StorageWith(config S3Config) (*S3Storage, error) {
+	var chain []credentials.Provider
+
+	log.Infof("New S3 Storage With Config: %+v", config)
+
+	wrapCredentialsProvider := func(p credentials.Provider) credentials.Provider { return p }
+	if config.SignatureV2 {
+		wrapCredentialsProvider = func(p credentials.Provider) credentials.Provider {
+			return &overrideSignerType{Provider: p, signerType: credentials.SignatureV2}
+		}
+	}
+
+	if err := validate(config); err != nil {
+		return nil, err
+	}
+	if config.AccessKey != "" {
+		chain = []credentials.Provider{wrapCredentialsProvider(&credentials.Static{
+			Value: credentials.Value{
+				AccessKeyID:     config.AccessKey,
+				SecretAccessKey: config.SecretKey,
+				SignerType:      credentials.SignatureV4,
+			},
+		})}
+	} else {
+		chain = []credentials.Provider{
+			wrapCredentialsProvider(&credentials.EnvAWS{}),
+			wrapCredentialsProvider(&credentials.FileAWSCredentials{}),
+			wrapCredentialsProvider(&credentials.IAM{
+				Client: &http.Client{
+					Transport: http.DefaultTransport,
+				},
+			}),
+		}
+	}
+
+	// Check if a roundtripper has been set in the config
+	// otherwise build the default transport.
+	var rt http.RoundTripper
+	if config.HTTPConfig.Transport != nil {
+		rt = config.HTTPConfig.Transport
+	} else {
+		rt = DefaultTransport(config)
+	}
+
+	client, err := minio.New(config.Endpoint, &minio.Options{
+		Creds:     credentials.NewChainCredentials(chain),
+		Secure:    !config.Insecure,
+		Region:    config.Region,
+		Transport: rt,
+	})
+	if err != nil {
+		return nil, errors.Wrap(err, "initialize s3 client")
+	}
+
+	var sse encrypt.ServerSide
+	if config.SSEConfig.Type != "" {
+		switch config.SSEConfig.Type {
+		case SSEKMS:
+			sse, err = encrypt.NewSSEKMS(config.SSEConfig.KMSKeyID, config.SSEConfig.KMSEncryptionContext)
+			if err != nil {
+				return nil, errors.Wrap(err, "initialize s3 client SSE-KMS")
+			}
+
+		case SSEC:
+			key, err := ioutil.ReadFile(config.SSEConfig.EncryptionKey)
+			if err != nil {
+				return nil, err
+			}
+
+			sse, err = encrypt.NewSSEC(key)
+			if err != nil {
+				return nil, errors.Wrap(err, "initialize s3 client SSE-C")
+			}
+
+		case SSES3:
+			sse = encrypt.NewSSE()
+
+		default:
+			sseErrMsg := errors.Errorf("Unsupported type %q was provided. Supported types are SSE-S3, SSE-KMS, SSE-C", config.SSEConfig.Type)
+			return nil, errors.Wrap(sseErrMsg, "Initialize s3 client SSE Config")
+		}
+	}
+
+	if config.ListObjectsVersion != "" && config.ListObjectsVersion != "v1" && config.ListObjectsVersion != "v2" {
+		return nil, errors.Errorf("Initialize s3 client list objects version: Unsupported version %q was provided. Supported values are v1, v2", config.ListObjectsVersion)
+	}
+
+	bkt := &S3Storage{
+		name:            config.Bucket,
+		client:          client,
+		defaultSSE:      sse,
+		putUserMetadata: config.PutUserMetadata,
+		partSize:        config.PartSize,
+		listObjectsV1:   config.ListObjectsVersion == "v1",
+	}
+	return bkt, nil
+}
+
+// Name returns the bucket name for s3.
+func (s3 *S3Storage) Name() string {
+	return s3.name
+}
+
+// validate checks that the required config options are set.
+func validate(conf S3Config) error {
+	if conf.Endpoint == "" {
+		return errors.New("no s3 endpoint in config file")
+	}
+
+	if conf.AccessKey == "" && conf.SecretKey != "" {
+		return errors.New("no s3 access_key specified while secret_key is present in config file; either both should be present in config or envvars/IAM should be used.")
+	}
+
+	if conf.AccessKey != "" && conf.SecretKey == "" {
+		return errors.New("no s3 secret_key specified while access_key is present in config file; either both should be present in config or envvars/IAM should be used.")
+	}
+
+	if conf.SSEConfig.Type == SSEC && conf.SSEConfig.EncryptionKey == "" {
+		return errors.New("encryption_key must be set if sse_config.type is set to 'SSE-C'")
+	}
+
+	if conf.SSEConfig.Type == SSEKMS && conf.SSEConfig.KMSKeyID == "" {
+		return errors.New("kms_key_id must be set if sse_config.type is set to 'SSE-KMS'")
+	}
+
+	return nil
+}
+
+// FullPath returns the object name as stored in the bucket (leading slash trimmed)
+func (s3 *S3Storage) FullPath(name string) string {
+	name = s3.trimLeading(name)
+
+	return name
+}
+
+// Read returns the full contents of the object with the given name.
+func (s3 *S3Storage) Read(name string) ([]byte, error) {
+	name = s3.trimLeading(name)
+
+	log.Infof("S3Storage::Read(%s)", name)
+	ctx := context.Background()
+
+	return s3.getRange(ctx, name, 0, -1)
+
+}
+
+// Exists checks if the given object exists.
+func (s3 *S3Storage) Exists(name string) (bool, error) {
+	name = s3.trimLeading(name)
+	//log.Infof("S3Storage::Exists(%s)", name)
+
+	ctx := context.Background()
+
+	_, err := s3.client.StatObject(ctx, s3.name, name, minio.StatObjectOptions{})
+	if err != nil {
+		if s3.isDoesNotExist(err) {
+			return false, nil
+		}
+		return false, errors.Wrap(err, "stat s3 object")
+	}
+
+	return true, nil
+}
+
+// Write uploads the given data as an object into the bucket.
+func (s3 *S3Storage) Write(name string, data []byte) error {
+	name = s3.trimLeading(name)
+
+	log.Infof("S3Storage::Write(%s)", name)
+
+	ctx := context.Background()
+	sse, err := s3.getServerSideEncryption(ctx)
+	if err != nil {
+		return err
+	}
+
+	size := int64(len(data))
+	var partSize uint64 = 0
+
+	r := bytes.NewReader(data)
+	_, err = s3.client.PutObject(ctx, s3.name, name, r, size, minio.PutObjectOptions{
+		PartSize:             partSize,
+		ServerSideEncryption: sse,
+		UserMetadata:         s3.putUserMetadata,
+	})
+
+	if err != nil {
+		return errors.Wrap(err, "upload s3 object")
+	}
+
+	return nil
+}
+
+// Stat returns information about the specified object.
+func (s3 *S3Storage) Stat(name string) (*StorageInfo, error) {
+	name = s3.trimLeading(name)
+
+	//log.Infof("S3Storage::Stat(%s)", name)
+	ctx := context.Background()
+
+	objInfo, err := s3.client.StatObject(ctx, s3.name, name, minio.StatObjectOptions{})
+	if err != nil {
+		if s3.isDoesNotExist(err) {
+			return nil, DoesNotExistError
+		}
+		return nil, err
+	}
+
+	return &StorageInfo{
+		Name:    s3.trimName(name),
+		Size:    objInfo.Size,
+		ModTime: objInfo.LastModified,
+	}, nil
+}
+
+// Remove deletes the object with the given name.
+func (s3 *S3Storage) Remove(name string) error {
+	name = s3.trimLeading(name)
+
+	log.Infof("S3Storage::Remove(%s)", name)
+	ctx := context.Background()
+
+	return s3.client.RemoveObject(ctx, s3.name, name, minio.RemoveObjectOptions{})
+}
+
+func (s3 *S3Storage) List(path string) ([]*StorageInfo, error) {
+	path = s3.trimLeading(path)
+
+	log.Infof("S3Storage::List(%s)", path)
+	ctx := context.Background()
+
+	// Ensure the object name actually ends with a dir suffix. Otherwise we'll just iterate the
+	// object itself as one prefix item.
+	if path != "" {
+		path = strings.TrimSuffix(path, DirDelim) + DirDelim
+	}
+
+	opts := minio.ListObjectsOptions{
+		Prefix:    path,
+		Recursive: false,
+		UseV1:     s3.listObjectsV1,
+	}
+
+	var stats []*StorageInfo
+	for object := range s3.client.ListObjects(ctx, s3.name, opts) {
+		// Catch the error when failed to list objects.
+		if object.Err != nil {
+			return nil, object.Err
+		}
+		// This sometimes happens with empty buckets.
+		if object.Key == "" {
+			continue
+		}
+		// The s3 client can also return the directory itself in the ListObjects call above.
+		if object.Key == path {
+			continue
+		}
+
+		stats = append(stats, &StorageInfo{
+			Name:    s3.trimName(object.Key),
+			Size:    object.Size,
+			ModTime: object.LastModified,
+		})
+	}
+
+	return stats, nil
+}
+
+// trimLeading removes a leading / from the file name
+func (s3 *S3Storage) trimLeading(file string) string {
+	if len(file) == 0 {
+		return file
+	}
+
+	if file[0] == '/' {
+		return file[1:]
+	}
+	return file
+}
+
+// trimName removes the leading directory prefix
+func (s3 *S3Storage) trimName(file string) string {
+	slashIndex := strings.LastIndex(file, "/")
+	if slashIndex < 0 {
+		return file
+	}
+
+	name := file[slashIndex+1:]
+	return name
+}
+
+// getServerSideEncryption returns the SSE to use.
+func (s3 *S3Storage) getServerSideEncryption(ctx context.Context) (encrypt.ServerSide, error) {
+	if value := ctx.Value(sseConfigKey); value != nil {
+		if sse, ok := value.(encrypt.ServerSide); ok {
+			return sse, nil
+		}
+		return nil, errors.New("invalid SSE config override provided in the context")
+	}
+
+	return s3.defaultSSE, nil
+}
+
+// isDoesNotExist returns true if error means that object key is not found.
+func (s3 *S3Storage) isDoesNotExist(err error) bool {
+	return minio.ToErrorResponse(errors.Cause(err)).Code == "NoSuchKey"
+}
+
+// isObjNotFound returns true if the error means that the object was not found
+func (s3 *S3Storage) isObjNotFound(err error) bool {
+	return minio.ToErrorResponse(errors.Cause(err)).Code == "NotFoundObject"
+}
+
+func (s3 *S3Storage) getRange(ctx context.Context, name string, off, length int64) ([]byte, error) {
+	sse, err := s3.getServerSideEncryption(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	opts := &minio.GetObjectOptions{ServerSideEncryption: sse}
+	if length != -1 {
+		if err := opts.SetRange(off, off+length-1); err != nil {
+			return nil, err
+		}
+	} else if off > 0 {
+		if err := opts.SetRange(off, 0); err != nil {
+			return nil, err
+		}
+	}
+	r, err := s3.client.GetObject(ctx, s3.name, name, *opts)
+	if err != nil {
+		if s3.isObjNotFound(err) {
+			return nil, DoesNotExistError
+		}
+		return nil, err
+	}
+
+	// NotFoundObject error is revealed only after first Read. This does the initial GetRequest. Prefetch this here
+	// for convenience.
+	if _, err := r.Read(nil); err != nil {
+		r.Close()
+		if s3.isObjNotFound(err) {
+			return nil, DoesNotExistError
+		}
+
+		return nil, errors.Wrap(err, "Read from S3 failed")
+	}
+
+	return ioutil.ReadAll(r)
+}
+
+type overrideSignerType struct {
+	credentials.Provider
+	signerType credentials.SignatureType
+}
+
+func (s *overrideSignerType) Retrieve() (credentials.Value, error) {
+	v, err := s.Provider.Retrieve()
+	if err != nil {
+		return v, err
+	}
+	if !v.SignerType.IsAnonymous() {
+		v.SignerType = s.signerType
+	}
+	return v, nil
+}
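NewS3StorageWith can also be called directly with a literal config (placeholders below; with no AccessKey set, the env/IAM credential chain above is used). Note that this path skips DefaultConfig, so the HTTP timeouts are zero values; callers wanting the defaults should go through NewS3Storage:

	s3store, err := NewS3StorageWith(S3Config{
		Bucket:   "my-kubecost-bucket",
		Endpoint: "s3.us-east-1.amazonaws.com",
		Region:   "us-east-1",
	})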

+ 55 - 0
pkg/storage/storage.go

@@ -0,0 +1,55 @@
+package storage
+
+import (
+	"os"
+	"time"
+)
+
+// DoesNotExistError is used as a generic error to return when a target path does not
+// exist in storage. Equivalent to os.ErrorNotExist such that it will work with os.IsNotExist(err)
+var DoesNotExistError = os.ErrNotExist
+
+// StorageInfo is a data object containing basic information about the path in storage.
+type StorageInfo struct {
+	Name    string    // base name of the file
+	Size    int64     // length in bytes for regular files
+	ModTime time.Time // modification time
+}
+
+// Storage provides an API for storing binary data
+type Storage interface {
+	// FullPath returns the storage working path combined with the path provided
+	FullPath(path string) string
+
+	// Stat returns the StorageInfo for the specific path.
+	Stat(path string) (*StorageInfo, error)
+
+	// Read uses the relative path of the storage combined with the provided path to
+	// read the contents.
+	Read(path string) ([]byte, error)
+
+	// Write uses the relative path of the storage combined with the provided path
+	// to write a new file or overwrite an existing file.
+	Write(path string, data []byte) error
+
+	// Remove uses the relative path of the storage combined with the provided path to
+	// remove a file from storage permanently.
+	Remove(path string) error
+
+	// Exists uses the relative path of the storage combined with the provided path to
+	// determine if the file exists.
+	Exists(path string) (bool, error)
+
+	// List uses the relative path of the storage combined with the provided path to return
+	// storage information for the files.
+	List(path string) ([]*StorageInfo, error)
+}
+
+// IsNotExist returns true if the error provided from a storage object is DoesNotExist
+func IsNotExist(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	return err.Error() == DoesNotExistError.Error()
+}
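This lets callers treat a missing object as a cache miss rather than a hard failure. A sketch, given any Storage `store`:

	data, err := store.Read("cluster-cache.json")
	if IsNotExist(err) {
		// first run: nothing persisted yet
	} else if err != nil {
		// real storage failure
	}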

+ 87 - 0
pkg/util/atomic/atomicrunstate.go

@@ -0,0 +1,87 @@
+package atomic
+
+import (
+	"sync"
+)
+
+// AtomicRunState can be used to provide thread-safe start/stop functionality to internal run-loops
+// inside a goroutine.
+type AtomicRunState struct {
+	lock     sync.Mutex
+	stopping bool
+	stop     chan struct{}
+	reset    chan struct{}
+}
+
+// Start checks for an existing run state and returns false if the run state has already started. If
+// the run state has not started, then it will advance to the started state and return true.
+func (ars *AtomicRunState) Start() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	if ars.stop != nil {
+		return false
+	}
+
+	ars.stop = make(chan struct{})
+	return true
+}
+
+// OnStop returns a channel that should be used within a goroutine's select run loop. It is
+// signaled whenever Stop() is executed. Once the channel is signaled, Reset() should be
+// called if the run state is to be used again.
+func (ars *AtomicRunState) OnStop() <-chan struct{} {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	return ars.stop
+}
+
+// Stop closes the stop channel, triggering any selects waiting on OnStop().
+func (ars *AtomicRunState) Stop() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	if !ars.stopping && ars.stop != nil {
+		ars.stopping = true
+		ars.reset = make(chan struct{})
+		close(ars.stop)
+		return true
+	}
+
+	return false
+}
+
+// Reset should be called in the select case for OnStop(). Note that calling Reset() before
+// selecting on OnStop() will result in a missed Stop signal.
+func (ars *AtomicRunState) Reset() {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	close(ars.reset)
+	ars.stopping = false
+	ars.stop = nil
+}
+
+// IsRunning returns true if the state is running or in the process of stopping.
+func (ars *AtomicRunState) IsRunning() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	return ars.stop != nil
+}
+
+// IsStopping returns true if the run state has been stopped, but not yet reset.
+func (ars *AtomicRunState) IsStopping() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	return ars.stopping && ars.stop != nil
+}
+
+// WaitForReset blocks until Reset() is called iff the run state is in the process of stopping.
+func (ars *AtomicRunState) WaitForReset() {
+	if ars.IsStopping() {
+		<-ars.reset
+	}
+}

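Taken together, the intended lifecycle is WaitForReset() -> Start() -> select on OnStop() -> Reset(). A minimal sketch of a run loop built on these primitives (Worker and its one-minute interval are illustrative; the IntervalRunner example further down in this diff is the fuller version):

package worker

import (
	"time"

	"github.com/kubecost/cost-model/pkg/util/atomic"
)

type Worker struct {
	state atomic.AtomicRunState
}

// Run starts a loop that calls work once a minute until Stop() is called.
func (w *Worker) Run(work func()) bool {
	w.state.WaitForReset() // don't race a Stop() that hasn't been Reset() yet
	if !w.state.Start() {
		return false // already running
	}

	go func() {
		for {
			select {
			case <-w.state.OnStop():
				w.state.Reset() // return to idle so Start() can succeed again
				return
			case <-time.After(time.Minute):
				work()
			}
		}
	}()

	return true
}

func (w *Worker) Stop() bool { return w.state.Stop() }
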
+ 174 - 0
pkg/util/atomic/atomicrunstate_test.go

@@ -0,0 +1,174 @@
+package atomic
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// NOTE: This test uses time.Sleep() in an attempt to schedule concurrent actions at specific times.
+// NOTE: Testing concurrency is hard, so if results are inconsistent, make sure it's not just the
+// NOTE: timing of the test on the particular hardware.
+func TestRunState(t *testing.T) {
+	t.Parallel()
+
+	var ars AtomicRunState
+
+	if !ars.Start() {
+		t.Fatalf("Failed to Start() AtomicRunState")
+	}
+
+	if ars.Start() {
+		t.Fatalf("Started AtomicRunState a second time")
+	}
+
+	success := make(chan bool)
+
+	go func() {
+		cycles := 0
+		for {
+			// Our test expects exactly 1 cycle, so if we exceed that, we fail!
+			if cycles >= 2 {
+				success <- false
+				return
+			}
+			// create a "work" time before the select
+			time.Sleep(1 * time.Second)
+
+			select {
+			case <-ars.OnStop():
+				t.Logf("Stopped\n")
+				ars.Reset()
+				success <- true
+				return
+			case <-time.After(2 * time.Second):
+				t.Logf("Tick\n")
+			}
+			cycles++
+		}
+	}()
+
+	// Wait for one full work cycle (3 seconds), attempt Stop during "work" phase
+	time.Sleep(3500 * time.Millisecond)
+	ars.Stop()
+
+	result := <-success
+	if !result {
+		t.Fatalf("Executed too many work cycles, expected 1 cycle")
+	}
+}
+
+// Potentially leaks goroutines, so use only in tests!
+func waitChannelFor(wg *sync.WaitGroup) chan bool {
+	ch := make(chan bool)
+	go func() {
+		wg.Wait()
+		ch <- true
+	}()
+	return ch
+}
+
+func TestDoubleWait(t *testing.T) {
+	t.Parallel()
+
+	var ars AtomicRunState
+
+	ars.WaitForReset()
+
+	if !ars.Start() {
+		t.Fatalf("Failed to Start() AtomicRunState")
+	}
+
+	if ars.Start() {
+		t.Fatalf("Started AtomicRunState a second time")
+	}
+
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	go func() {
+		t.Logf("GoRoutine 1 Waiting....")
+		<-ars.OnStop()
+		wg.Done()
+	}()
+
+	go func() {
+		t.Logf("GoRoutine 2 Waiting....")
+		<-ars.OnStop()
+		wg.Done()
+	}()
+
+	time.Sleep(1 * time.Second)
+	ars.Stop()
+	select {
+	case <-time.After(time.Second):
+		t.Fatalf("Did not receive signal from both goroutines after a second\n")
+		return
+	case <-waitChannelFor(&wg):
+		t.Logf("Received signals from both goroutines\n")
+	}
+	ars.Reset()
+}
+
+func TestContinuousConcurrentStartsAndStops(t *testing.T) {
+	t.Parallel()
+
+	const cycles = 5
+
+	var ars AtomicRunState
+	started := make(chan bool)
+
+	var wg sync.WaitGroup
+	wg.Add(cycles)
+
+	// Continuously try to start the ars in a tight loop,
+	// throttled by OnStop() and WaitForReset().
+	go func() {
+		defer func() {
+			if e := recover(); e != nil {
+				// Sometimes the WaitGroup counter goes negative at the end of the test.
+				// This is OK given the way the test behaves (chaotic start/stop calls),
+				// so we can safely ignore the resulting panic.
+			}
+		}()
+
+		firstCycle := true
+		for {
+			ars.WaitForReset()
+			if ars.Start() {
+				t.Logf("Started")
+				if firstCycle {
+					firstCycle = false
+					started <- true
+				}
+				wg.Done()
+			}
+
+			<-ars.OnStop()
+			t.Logf("Stopped")
+		}
+	}()
+
+	// wait for an initial start
+	<-started
+
+	// Loop Stop/Resets from other goroutines
+	go func() {
+		for {
+			time.Sleep(100 * time.Millisecond)
+			if ars.Stop() {
+				<-ars.OnStop()
+				time.Sleep(500 * time.Millisecond)
+				ars.Reset()
+			}
+		}
+	}()
+
+	// Wait for full cycles
+	select {
+	case <-time.After(5 * time.Second):
+		t.Fatalf("Didn't complete %d cycles after 5 seconds", cycles)
+	case <-waitChannelFor(&wg):
+		t.Logf("Completed!")
+	}
+}

+ 139 - 0
pkg/util/atomic/example_intervalrunner_test.go

@@ -0,0 +1,139 @@
+package atomic_test
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/util/atomic"
+)
+
+// IntervalRunner is an example implementation of AtomicRunState.
+type IntervalRunner struct {
+	runState atomic.AtomicRunState
+	action   func()
+	interval time.Duration
+}
+
+// NewIntervalRunner creates a new instance of an interval runner to execute the provided
+// function on a designated interval until explicitly stopped.
+func NewIntervalRunner(action func(), interval time.Duration) *IntervalRunner {
+	return &IntervalRunner{
+		action:   action,
+		interval: interval,
+	}
+}
+
+// Start begins the interval execution. It returns true if the interval execution successfully starts.
+// It will return false if the interval execution is already running.
+func (ir *IntervalRunner) Start() bool {
+	// Before we attempt to start, we must ensure we are not in a stopping state; this is a
+	// common pattern that should be used with AtomicRunState.
+	ir.runState.WaitForReset()
+
+	// This will atomically check the current state to ensure we can run, then advances the state.
+	// If the state is already started, it will return false.
+	if !ir.runState.Start() {
+		return false
+	}
+
+	// our run state is advanced, let's execute our action on the interval
+	// spawn a new goroutine which will loop and wait the interval each iteration
+	go func() {
+		for {
+			// use a select statement to receive whichever channel receives data first
+			select {
+			// if our stop channel receives data, it means we have explicitly called
+			// Stop(), and must reset our AtomicRunState to its initial idle state
+			case <-ir.runState.OnStop():
+				ir.runState.Reset()
+				return // exit go routine
+
+			// After our interval elapses, fall through
+			case <-time.After(ir.interval):
+			}
+
+			// Execute the function
+			ir.action()
+
+			// Loop back to the select where we will wait for the interval to elapse
+			// or an explicit stop to be called
+		}
+	}()
+
+	return true
+}
+
+// Stop will explicitly stop the execution of the interval runner. If an action is already executing, it will wait
+// until completion before processing the stop. Any attempts to start during the stopping phase will block until
+// it's possible to Start() again.
+func (ir *IntervalRunner) Stop() bool {
+	return ir.runState.Stop()
+}
+
+func Example_intervalRunner() {
+	count := 0
+
+	// As a general test, we'll use a goroutine which waits for a specific number of
+	// ticks before calling stop, then issues a signal back to the main thread
+	var wg sync.WaitGroup
+	wg.Add(4)
+
+	// Create a new IntervalRunner instance to execute our print action every second
+	ir := NewIntervalRunner(
+		func() {
+			fmt.Printf("Tick[%d]\n", count)
+			count++
+			// count this tick against the wait group
+			wg.Done()
+		},
+		time.Second,
+	)
+
+	// Start the runner, panic on failure
+	if !ir.Start() {
+		panic("Failed to start interval runner!")
+	}
+
+	// spin up a second goroutine which will wait for a specific number of
+	// ticks before calling Stop(). This is a bit contrived, but demonstrates
+	// multiple goroutines controlling the same interval runner.
+	complete := make(chan bool)
+	go func() {
+		wg.Wait()
+
+		// Stop the interval runner, notify main thread
+		ir.Stop()
+		complete <- true
+	}()
+
+	<-complete
+
+	// Start immediately again using a different total tick count
+	count = 0
+	wg.Add(2)
+
+	// Start the runner, panic on failure
+	if !ir.Start() {
+		panic("Failed to start interval runner!")
+	}
+
+	// Create a new Stop waiter
+	go func() {
+		wg.Wait()
+
+		// Stop the interval runner, notify main thread
+		ir.Stop()
+		complete <- true
+	}()
+
+	<-complete
+
+	// Output:
+	// Tick[0]
+	// Tick[1]
+	// Tick[2]
+	// Tick[3]
+	// Tick[0]
+	// Tick[1]
+}

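Because Example_intervalRunner ends with an // Output: comment, go test executes it and compares the printed ticks against that block, so the example doubles as a regression test for the start/stop/restart sequence.
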
+ 34 - 8
test/cloud_test.go

@@ -10,6 +10,7 @@ import (
 
 	"github.com/kubecost/cost-model/pkg/cloud"
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/costmodel"
 	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 
@@ -123,6 +124,10 @@ func TestNodePriceFromCSV(t *testing.T) {
 	nameWant := "gke-standard-cluster-1-pool-1-91dc432d-cg69"
 	labelFooWant := "labelfoo"
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	n := &v1.Node{}
 	n.Spec.ProviderID = providerIDWant
 	n.Name = nameWant
@@ -134,7 +139,7 @@ func TestNodePriceFromCSV(t *testing.T) {
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -164,7 +169,7 @@ func TestNodePriceFromCSV(t *testing.T) {
 	c2 := &cloud.CSVProvider{
 		CSVLocation: "../configs/fake.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	k3 := c.GetKey(n.Labels, n)
@@ -179,6 +184,10 @@ func TestNodePriceFromCSVWithRegion(t *testing.T) {
 	nameWant := "foo"
 	labelFooWant := "labelfoo"
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	n := &v1.Node{}
 	n.Spec.ProviderID = providerIDWant
 	n.Name = nameWant
@@ -206,7 +215,7 @@ func TestNodePriceFromCSVWithRegion(t *testing.T) {
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_region.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -256,7 +265,7 @@ func TestNodePriceFromCSVWithRegion(t *testing.T) {
 	c2 := &cloud.CSVProvider{
 		CSVLocation: "../configs/fake.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	k5 := c.GetKey(n.Labels, n)
@@ -291,10 +300,14 @@ type FakeClusterMap struct {
 
 func TestNodePriceFromCSVWithBadConfig(t *testing.T) {
 	os.Setenv("CONFIG_PATH", "../config")
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("invalid.json"),
+			Config: cloud.NewProviderConfig(confMan, "invalid.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -320,10 +333,15 @@ func TestNodePriceFromCSVWithBadConfig(t *testing.T) {
 
 func TestSourceMatchesFromCSV(t *testing.T) {
 	os.Setenv("CONFIG_PATH", "../configs")
+
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "/default.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -395,10 +413,14 @@ func TestNodePriceFromCSVWithCase(t *testing.T) {
 	n.Labels[v1.LabelZoneRegion] = "eastus2"
 	wantPrice := "0.13370357"
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 
@@ -425,10 +447,14 @@ func TestNodePriceFromCSVByClass(t *testing.T) {
 	wantpricefloat := 0.13370357
 	wantPrice := fmt.Sprintf("%f", (math.Round(wantpricefloat*1000000) / 1000000))
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 

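Each test above now hand-builds the same ConfigFileManager before constructing its CSVProvider. A hedged refactoring sketch that would collapse those call sites to one line each, using the cloud and config imports already present in cloud_test.go (newTestCSVProvider is hypothetical, not part of this diff):

// newTestCSVProvider builds a CSVProvider backed by a local-only
// ConfigFileManager, mirroring the setup repeated in the tests above.
func newTestCSVProvider(csvPath, configPath string) *cloud.CSVProvider {
	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
		LocalConfigPath: "./",
	})
	return &cloud.CSVProvider{
		CSVLocation: csvPath,
		CustomProvider: &cloud.CustomProvider{
			Config: cloud.NewProviderConfig(confMan, configPath),
		},
	}
}

// Usage: c := newTestCSVProvider("../configs/pricing_schema.csv", "../configs/default.json")
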
File diff view limited because it is too large
+ 483 - 648
ui/package-lock.json


+ 1 - 1
ui/package.json

@@ -19,7 +19,7 @@
     "@material-ui/icons": "^4.11.2",
     "@material-ui/pickers": "^3.3.10",
     "@material-ui/styles": "^4.11.3",
-    "axios": "^0.21.1",
+    "axios": "^0.21.2",
     "date-fns": "^2.19.0",
     "material-design-icons-iconfont": "^6.1.0",
     "prop-types": "^15.7.2",

Not all files are shown because too many files changed in this diff