
Merge pull request #987 from kubecost/bolt/remote-config

Kubecost Agent: Remote Configuration and Synchronization
Matt Bolt 4 years ago
Parent
Commit
e6fcdb77d6

+ 34 - 12
cmd/kubemetrics/main.go

@@ -9,6 +9,7 @@ import (
 
 	"github.com/kubecost/cost-model/pkg/cloud"
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/costmodel"
 	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 	"github.com/kubecost/cost-model/pkg/env"
@@ -27,6 +28,13 @@ import (
 	"k8s.io/klog"
 )
 
+// ClusterExportInterval is the interval used to export the cluster if env.IsExportClusterCacheEnabled() is true
+const ClusterExportInterval = 5 * time.Minute
+
+// clusterExporter is used if env.IsExportClusterCacheEnabled() is set to true.
+// It exports the kubernetes cluster data to a file on a specific interval.
+var clusterExporter *clustercache.ClusterExporter
+
 func Healthz(w http.ResponseWriter, _ *http.Request) {
 	w.WriteHeader(200)
 	w.Header().Set("Content-Length", "0")
@@ -34,7 +42,7 @@ func Healthz(w http.ResponseWriter, _ *http.Request) {
 }
 
 // initializes the kubernetes client cache
-func newKubernetesClusterCache() (clustercache.ClusterCache, error) {
+func newKubernetesClusterCache() (kubernetes.Interface, clustercache.ClusterCache, error) {
 	var err error
 
 	// Kubernetes API setup
@@ -46,19 +54,19 @@ func newKubernetesClusterCache() (clustercache.ClusterCache, error) {
 	}
 
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
 	kubeClientset, err := kubernetes.NewForConfig(kc)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
 	// Create Kubernetes Cluster Cache + Watchers
 	k8sCache := clustercache.NewKubernetesClusterCache(kubeClientset)
 	k8sCache.Run()
 
-	return k8sCache, nil
+	return kubeClientset, k8sCache, nil
 }
 
 func newPrometheusClient() (prometheus.Client, error) {
@@ -124,13 +132,19 @@ func main() {
 	klog.Infof("Using scrape interval of %f", scrapeInterval.Seconds())
 
 	// initialize kubernetes client and cluster cache
-	clusterCache, err := newKubernetesClusterCache()
+	k8sClient, clusterCache, err := newKubernetesClusterCache()
 	if err != nil {
 		panic(err.Error())
 	}
 
+	// Create ConfigFileManager for synchronization of shared configuration
+	confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		BucketStoreConfig: env.GetKubecostConfigBucket(),
+		LocalConfigPath:   "/",
+	})
+
 	cloudProviderKey := env.GetCloudProviderAPIKey()
-	cloudProvider, err := cloud.NewProvider(clusterCache, cloudProviderKey)
+	cloudProvider, err := cloud.NewProvider(clusterCache, cloudProviderKey, confManager)
 	if err != nil {
 		panic(err.Error())
 	}
@@ -140,7 +154,6 @@ func main() {
 	watchConfigFunc := configWatchers.ToWatchFunc()
 	watchedConfigs := configWatchers.GetWatchedConfigs()
 
-	k8sClient := clusterCache.GetClient()
 	kubecostNamespace := env.GetKubecostNamespace()
 
 	// We need an initial invocation because the init of the cache has happened before we had access to the provider.
@@ -155,16 +168,25 @@ func main() {
 
 	clusterCache.SetConfigMapUpdateFunc(watchConfigFunc)
 
+	// Initialize cluster exporting if it's enabled
+	if env.IsExportClusterCacheEnabled() {
+		cacheLocation := confManager.ConfigFileAt("/var/configs/cluster-cache.json")
+		clusterExporter = clustercache.NewClusterExporter(clusterCache, cacheLocation, ClusterExportInterval)
+		clusterExporter.Run()
+	}
+
+// ClusterInfo provider supplies the cluster map with local and remote cluster data
+	clusterInfoConf := confManager.ConfigFileAt("/var/configs/cluster-info.json")
+	localClusterInfo := costmodel.NewLocalClusterInfoProvider(k8sClient, cloudProvider)
+	clusterInfoProvider := costmodel.NewClusterInfoWriteOnRequest(localClusterInfo, clusterInfoConf)
+
 	// Initialize ClusterMap for maintaining ClusterInfo by ClusterID
-	clusterMap := clusters.NewClusterMap(
-		promCli,
-		costmodel.NewLocalClusterInfoProvider(k8sClient, cloudProvider),
-		5*time.Minute)
+	clusterMap := clusters.NewClusterMap(promCli, clusterInfoProvider, 5*time.Minute)
 
 	costModel := costmodel.NewCostModel(promCli, cloudProvider, clusterCache, clusterMap, scrapeInterval)
 
 	// initialize Kubernetes Metrics Emitter
-	metricsEmitter := costmodel.NewCostModelMetricsEmitter(promCli, clusterCache, cloudProvider, costModel)
+	metricsEmitter := costmodel.NewCostModelMetricsEmitter(promCli, clusterCache, cloudProvider, clusterInfoProvider, costModel)
 
 	// download pricing data
 	err = cloudProvider.DownloadPricingData()
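
For context, a minimal sketch of the configuration wiring this hunk introduces, assuming only the APIs visible in this diff (config.NewConfigFileManager, ConfigFileManagerOpts, ConfigFileAt, and the ConfigFile Read/Write methods from pkg/config/configfile.go below). The path and the empty bucket config value are illustrative, not taken from the commit.

package main

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/config"
)

func main() {
	// An empty BucketStoreConfig leaves bucket synchronization disabled;
	// pointing it at a storage.NewBucketStorage-compatible config enables it.
	confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
		BucketStoreConfig: "",
		LocalConfigPath:   "/",
	})

	// ConfigFileAt returns a *config.ConfigFile handle for the given path.
	cf := confManager.ConfigFileAt("/var/configs/cluster-info.json")

	// Write updates the backing store and the in-memory cache on success.
	if err := cf.Write([]byte(`{"id":"cluster-one"}`)); err != nil {
		fmt.Println("write failed:", err)
		return
	}

	// Read serves from the in-memory cache once it is populated.
	data, err := cf.Read()
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	fmt.Printf("cluster-info: %s\n", data)
}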

+ 3 - 1
go.mod

@@ -15,13 +15,15 @@ require (
 	github.com/aws/aws-sdk-go-v2 v1.9.0
 	github.com/davecgh/go-spew v1.1.1
 	github.com/getsentry/sentry-go v0.6.1
-	github.com/google/uuid v1.1.2
+	github.com/google/uuid v1.3.0
 	github.com/json-iterator/go v1.1.10
 	github.com/jszwec/csvutil v1.2.1
 	github.com/julienschmidt/httprouter v1.3.0
 	github.com/lib/pq v1.2.0
 	github.com/microcosm-cc/bluemonday v1.0.5
+	github.com/minio/minio-go/v7 v7.0.15
 	github.com/patrickmn/go-cache v2.1.0+incompatible
+	github.com/pkg/errors v0.9.1
 	github.com/prometheus/client_golang v1.0.0
 	github.com/prometheus/client_model v0.2.0
 	github.com/rs/cors v1.7.0

+ 27 - 1
go.sum

@@ -111,6 +111,7 @@ github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi
 github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE=
 github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM=
 github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
+github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
 github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
 github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM=
 github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc=
@@ -207,13 +208,15 @@ github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hf
 github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM=
 github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
 github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
+github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
 github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=
 github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
 github.com/googleapis/gnostic v0.4.1 h1:DLJCy1n/vrD4HPjOvYcT8aYQXpPIzoRZONaYwyycI+I=
 github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg=
+github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
 github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
 github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
 github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
@@ -247,6 +250,7 @@ github.com/jstemmer/go-junit-report v0.9.1 h1:6QPYqodiu3GuPL+7mfx+NwDdp2eTkp9IfE
 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
 github.com/jszwec/csvutil v1.2.1 h1:9+vmGqMdYxIbeDmVbTrVryibx2izwHAfKdPwl4GPNHM=
 github.com/jszwec/csvutil v1.2.1/go.mod h1:8YHz6C3KVdIeCxLMvwbbIVDCTA/Wi2df93AZlQNaE2U=
+github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
 github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
 github.com/juju/errors v0.0.0-20181118221551-089d3ea4e4d5/go.mod h1:W54LbzXuIE0boCoNJfwqpmkKJ1O4TCTZMetAt6jGk7Q=
 github.com/juju/loggo v0.0.0-20180524022052-584905176618/go.mod h1:vgyd7OREkbtVEN/8IXZe5Ooef3LQePvuBm9UWj6ZL8U=
@@ -263,7 +267,12 @@ github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQL
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.9.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
+github.com/klauspost/compress v1.13.5 h1:9O69jUPDcsT9fEm74W92rZL9FQY7rCdaXVneq+yyzl4=
+github.com/klauspost/compress v1.13.5/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
 github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
+github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
+github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s=
+github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
@@ -293,6 +302,12 @@ github.com/mediocregopher/radix/v3 v3.3.0/go.mod h1:EmfVyvspXz1uZEyPBMyGK+kjWiKQ
 github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc=
 github.com/microcosm-cc/bluemonday v1.0.5 h1:cF59UCKMmmUgqN1baLvqU/B1ZsMori+duLVTLpgiG3w=
 github.com/microcosm-cc/bluemonday v1.0.5/go.mod h1:8iwZnFn2CDDNZ0r6UXhF4xawGvzaqzCRa1n3/lO3W2w=
+github.com/minio/md5-simd v1.1.0 h1:QPfiOqlZH+Cj9teu0t9b1nTBfPbyTl16Of5MeuShdK4=
+github.com/minio/md5-simd v1.1.0/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw=
+github.com/minio/minio-go/v7 v7.0.15 h1:r9/NhjJ+nXYrIYvbObhvc1wPj3YH1iDpJzz61uRKLyY=
+github.com/minio/minio-go/v7 v7.0.15/go.mod h1:pUV0Pc+hPd1nccgmzQF/EXh48l/Z/yps6QPF1aaie4g=
+github.com/minio/sha256-simd v0.1.1 h1:5QHSlgo3nt5yKOJrC7W8w7X+NFl8cMPZm96iu8kKUJU=
+github.com/minio/sha256-simd v0.1.1/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM=
 github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
@@ -346,6 +361,8 @@ github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsT
 github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
 github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik=
 github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU=
+github.com/rs/xid v1.2.1 h1:mhH9Nq+C1fY2l1XIpgxIiUOfNpRBYH1kKcr+qfKgjRc=
+github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ=
 github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
 github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
 github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
@@ -355,7 +372,11 @@ github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24 h1:pntxY8Ary0t4
 github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4=
 github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
+github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
+github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
+github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
+github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
 github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
 github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
 github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk=
@@ -409,6 +430,7 @@ golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8U
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20201216223049-8b5274cf687f/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
 golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad h1:DN0cp81fZ3njFcrLCytUHRSUkqBjfTo4Tx9RJTWs0EY=
 golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -467,6 +489,7 @@ golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20201110031124-69a78807bb2b h1:uwuIcX0g4Yl1NC5XAz37xsr2lTtcqevgzYNVt49waME=
 golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
@@ -513,6 +536,7 @@ golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200828194041-157a740278f4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201112073958-5cba982894dd h1:5CtCZbICpIOFdgO940moixOPjc0178IU44m4EjOO5IY=
@@ -639,6 +663,8 @@ gopkg.in/go-playground/assert.v1 v1.2.1/go.mod h1:9RXL0bg/zibRAgZUYszZSwO/z8Y/a8
 gopkg.in/go-playground/validator.v8 v8.18.2/go.mod h1:RX2a/7Ha8BgOhfk7j780h4/u/RRjR0eouCJSH80/M2Y=
 gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
 gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
+gopkg.in/ini.v1 v1.57.0 h1:9unxIsFcTt4I55uWluz+UmL95q4kdJ0buvQ1ZIqVQww=
+gopkg.in/ini.v1 v1.57.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/mgo.v2 v2.0.0-20180705113604-9856a29383ce/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
 gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

+ 1 - 1
pkg/cloud/awsprovider.go

@@ -2020,7 +2020,7 @@ func (a *AWS) ExternalAllocations(start string, end string, aggregators []string
 		}
 
 		if customPricing.BillingDataDataset != "" && !crossCluster { // There is GCP data, meaning someone has tried to configure a GCP out-of-cluster allocation.
-			gcp, err := NewCrossClusterProvider("gcp", "aws.json", a.Clientset)
+			gcp, err := NewCrossClusterProvider("gcp", a.Config.ConfigFileManager(), "aws.json", a.Clientset)
 			if err != nil {
 				klog.Infof("Could not instantiate cross-cluster provider %s", err.Error())
 			}

+ 1 - 1
pkg/cloud/gcpprovider.go

@@ -352,7 +352,7 @@ func (gcp *GCP) ExternalAllocations(start string, end string, aggregators []stri
 
 	var s []*OutOfClusterAllocation
 	if c.ServiceKeyName != "" && c.ServiceKeySecret != "" && !crossCluster {
-		aws, err := NewCrossClusterProvider("aws", "gcp.json", gcp.Clientset)
+		aws, err := NewCrossClusterProvider("aws", gcp.Config.ConfigFileManager(), "gcp.json", gcp.Clientset)
 		if err != nil {
 			klog.Infof("Could not instantiate cross-cluster provider %s", err.Error())
 		}

+ 19 - 17
pkg/cloud/provider.go

@@ -4,18 +4,20 @@ import (
 	"database/sql"
 	"errors"
 	"fmt"
-	"github.com/kubecost/cost-model/pkg/util"
 	"io"
 	"regexp"
 	"strconv"
 	"strings"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/util"
+
 	"k8s.io/klog"
 
 	"cloud.google.com/go/compute/metadata"
 
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/watcher"
@@ -383,31 +385,31 @@ func ShareTenancyCosts(p Provider) bool {
 	return config.ShareTenancyCosts == "true"
 }
 
-func NewCrossClusterProvider(ctype string, overrideConfigPath string, cache clustercache.ClusterCache) (Provider, error) {
+func NewCrossClusterProvider(ctype string, config *config.ConfigFileManager, overrideConfigPath string, cache clustercache.ClusterCache) (Provider, error) {
 	if ctype == "aws" {
 		return &AWS{
 			Clientset: cache,
-			Config:    NewProviderConfig(overrideConfigPath),
+			Config:    NewProviderConfig(config, overrideConfigPath),
 		}, nil
 	} else if ctype == "gcp" {
 		return &GCP{
 			Clientset: cache,
-			Config:    NewProviderConfig(overrideConfigPath),
+			Config:    NewProviderConfig(config, overrideConfigPath),
 		}, nil
 	} else if ctype == "azure" {
 		return &Azure{
 			Clientset: cache,
-			Config:    NewProviderConfig(overrideConfigPath),
+			Config:    NewProviderConfig(config, overrideConfigPath),
 		}, nil
 	}
 	return &CustomProvider{
 		Clientset: cache,
-		Config:    NewProviderConfig(overrideConfigPath),
+		Config:    NewProviderConfig(config, overrideConfigPath),
 	}, nil
 }
 
 // NewProvider looks at the nodespec or provider metadata server to decide which provider to instantiate.
-func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, error) {
+func NewProvider(cache clustercache.ClusterCache, apiKey string, config *config.ConfigFileManager) (Provider, error) {
 	nodes := cache.GetAllNodes()
 	if len(nodes) == 0 {
 		return nil, fmt.Errorf("Could not locate any nodes for cluster.")
@@ -422,7 +424,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 			CSVLocation: env.GetCSVPath(),
 			CustomProvider: &CustomProvider{
 				Clientset: cache,
-				Config:    NewProviderConfig(cp.configFileName),
+				Config:    NewProviderConfig(config, cp.configFileName),
 			},
 		}, nil
 	case "GCP":
@@ -433,7 +435,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		return &GCP{
 			Clientset:        cache,
 			APIKey:           apiKey,
-			Config:           NewProviderConfig(cp.configFileName),
+			Config:           NewProviderConfig(config, cp.configFileName),
 			clusterRegion:    cp.region,
 			clusterProjectId: cp.projectID,
 		}, nil
@@ -441,7 +443,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		klog.V(2).Info("Found ProviderID starting with \"aws\", using AWS Provider")
 		return &AWS{
 			Clientset:        cache,
-			Config:           NewProviderConfig(cp.configFileName),
+			Config:           NewProviderConfig(config, cp.configFileName),
 			clusterRegion:    cp.region,
 			clusterAccountId: cp.accountID,
 		}, nil
@@ -449,7 +451,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		klog.V(2).Info("Found ProviderID starting with \"azure\", using Azure Provider")
 		return &Azure{
 			Clientset:        cache,
-			Config:           NewProviderConfig(cp.configFileName),
+			Config:           NewProviderConfig(config, cp.configFileName),
 			clusterRegion:    cp.region,
 			clusterAccountId: cp.accountID,
 		}, nil
@@ -457,7 +459,7 @@ func NewProvider(cache clustercache.ClusterCache, apiKey string) (Provider, erro
 		klog.V(2).Info("Unsupported provider, falling back to default")
 		return &CustomProvider{
 			Clientset: cache,
-			Config:    NewProviderConfig(cp.configFileName),
+			Config:    NewProviderConfig(config, cp.configFileName),
 		}, nil
 	}
 }
@@ -470,15 +472,15 @@ type clusterProperties struct {
 	projectID      string
 }
 
-func getClusterProperties(node *v1.Node) (clusterProperties) {
+func getClusterProperties(node *v1.Node) clusterProperties {
 	providerID := strings.ToLower(node.Spec.ProviderID)
 	region, _ := util.GetRegion(node.Labels)
 	cp := clusterProperties{
-		provider: "DEFAULT",
+		provider:       "DEFAULT",
 		configFileName: "default.json",
-		region: region,
-		accountID: "",
-		projectID: "",
+		region:         region,
+		accountID:      "",
+		projectID:      "",
 	}
 	if metadata.OnGCE() {
 		cp.provider = "GCP"
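
A hedged sketch of the updated call sites: NewProvider and NewCrossClusterProvider now both take the shared *config.ConfigFileManager, so every ProviderConfig reads and writes through the same (possibly bucket-backed) storage. The helper below is assumed scaffolding; the "gcp"/"aws.json" pairing mirrors the awsprovider.go hunk above.

package example

import (
	"github.com/kubecost/cost-model/pkg/cloud"
	"github.com/kubecost/cost-model/pkg/clustercache"
	"github.com/kubecost/cost-model/pkg/config"
	"github.com/kubecost/cost-model/pkg/env"
)

// newProviders builds the primary provider plus a GCP cross-cluster provider
// that shares the same ConfigFileManager, as ExternalAllocations does above.
func newProviders(cache clustercache.ClusterCache, cm *config.ConfigFileManager) (cloud.Provider, cloud.Provider, error) {
	primary, err := cloud.NewProvider(cache, env.GetCloudProviderAPIKey(), cm)
	if err != nil {
		return nil, nil, err
	}

	// "aws.json" is the override config path used by the AWS caller in this diff.
	gcp, err := cloud.NewCrossClusterProvider("gcp", cm, "aws.json", cache)
	if err != nil {
		return nil, nil, err
	}
	return primary, gcp, nil
}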

+ 69 - 34
pkg/cloud/providerconfig.go

@@ -2,14 +2,15 @@ package cloud
 
 import (
 	"fmt"
-	"io/ioutil"
+	gopath "path"
 	"reflect"
 	"strconv"
 	"strings"
 	"sync"
 
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/env"
-	"github.com/kubecost/cost-model/pkg/util/fileutil"
+	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/json"
 	"github.com/microcosm-cc/bluemonday"
 
@@ -18,23 +19,61 @@ import (
 
 var sanitizePolicy = bluemonday.UGCPolicy()
 
-// ProviderConfig is a utility class that provides a thread-safe configuration
-// storage/cache for all Provider implementations
+// ProviderConfig is a utility class that provides a thread-safe configuration storage/cache for all Provider
+// implementations
 type ProviderConfig struct {
-	lock          *sync.Mutex
-	fileName      string
-	configPath    string
-	customPricing *CustomPricing
+	lock            *sync.Mutex
+	configManager   *config.ConfigFileManager
+	configFile      *config.ConfigFile
+	customPricing   *CustomPricing
+	watcherHandleID config.HandlerID
 }
 
-// Creates a new ProviderConfig instance
-func NewProviderConfig(file string) *ProviderConfig {
-	return &ProviderConfig{
+// NewProviderConfig creates a new ConfigFile and returns the ProviderConfig
+func NewProviderConfig(configManager *config.ConfigFileManager, fileName string) *ProviderConfig {
+	configFile := configManager.ConfigFileAt(configPathFor(fileName))
+	pc := &ProviderConfig{
 		lock:          new(sync.Mutex),
-		fileName:      file,
-		configPath:    configPathFor(file),
+		configManager: configManager,
+		configFile:    configFile,
 		customPricing: nil,
 	}
+
+	// add the provider config func as handler for the config file changes
+	pc.watcherHandleID = configFile.AddChangeHandler(pc.onConfigFileUpdated)
+	return pc
+}
+
+// onConfigFileUpdated handles any time the config file contents are updated, created, or deleted
+func (pc *ProviderConfig) onConfigFileUpdated(changeType config.ChangeType, data []byte) {
+	// TODO: (bolt) Currently this has the side-effect of setting pc.customPricing twice when the update
+	// TODO: (bolt) is made from this ProviderConfig instance. We'll need to implement a way of identifying
+	// TODO: (bolt) when to ignore updates when the change and handler are the same source
+	log.Infof("CustomPricing Config Updated: %s", changeType)
+
+	switch changeType {
+	case config.ChangeTypeCreated:
+		fallthrough
+	case config.ChangeTypeModified:
+		pc.lock.Lock()
+		defer pc.lock.Unlock()
+
+		customPricing := new(CustomPricing)
+		err := json.Unmarshal(data, customPricing)
+		if err != nil {
+			klog.Infof("Could not decode Custom Pricing file at path %s. Using default.", pc.configFile.Path())
+			customPricing = DefaultPricing()
+		}
+
+		pc.customPricing = customPricing
+		if pc.customPricing.SpotGPU == "" {
+			pc.customPricing.SpotGPU = DefaultPricing().SpotGPU // Migration for users without this value set by default.
+		}
+
+		if pc.customPricing.ShareTenancyCosts == "" {
+			pc.customPricing.ShareTenancyCosts = defaultShareTenancyCost
+		}
+	}
 }
 
 // Non-ThreadSafe logic to load the config file if a cache does not exist. Flag to write
@@ -44,16 +83,16 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 		return pc.customPricing, nil
 	}
 
-	exists, err := fileExists(pc.configPath)
+	exists, err := pc.configFile.Exists()
 	// File Error other than NotExists
 	if err != nil {
-		klog.Infof("Custom Pricing file at path '%s' read error: '%s'", pc.configPath, err.Error())
+		klog.Infof("Custom Pricing file at path '%s' read error: '%s'", pc.configFile.Path(), err.Error())
 		return DefaultPricing(), err
 	}
 
 	// File Doesn't Exist
 	if !exists {
-		klog.Infof("Could not find Custom Pricing file at path '%s'", pc.configPath)
+		klog.Infof("Could not find Custom Pricing file at path '%s'", pc.configFile.Path())
 		pc.customPricing = DefaultPricing()
 
 		// Only write the file if flag enabled
@@ -63,9 +102,9 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 				return pc.customPricing, err
 			}
 
-			err = ioutil.WriteFile(pc.configPath, cj, 0644)
+			err = pc.configFile.Write(cj)
 			if err != nil {
-				klog.Infof("Could not write Custom Pricing file to path '%s'", pc.configPath)
+				klog.Infof("Could not write Custom Pricing file to path '%s'", pc.configFile.Path())
 				return pc.customPricing, err
 			}
 		}
@@ -74,9 +113,9 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 	}
 
 	// File Exists - Read all contents of file, unmarshal json
-	byteValue, err := ioutil.ReadFile(pc.configPath)
+	byteValue, err := pc.configFile.Read()
 	if err != nil {
-		klog.Infof("Could not read Custom Pricing file at path %s", pc.configPath)
+		klog.Infof("Could not read Custom Pricing file at path %s", pc.configFile.Path())
 		// If read fails, we don't want to cache default, assuming that the file is valid
 		return DefaultPricing(), err
 	}
@@ -84,7 +123,7 @@ func (pc *ProviderConfig) loadConfig(writeIfNotExists bool) (*CustomPricing, err
 	var customPricing CustomPricing
 	err = json.Unmarshal(byteValue, &customPricing)
 	if err != nil {
-		klog.Infof("Could not decode Custom Pricing file at path %s", pc.configPath)
+		klog.Infof("Could not decode Custom Pricing file at path %s", pc.configFile.Path())
 		return DefaultPricing(), err
 	}
 
@@ -108,6 +147,13 @@ func (pc *ProviderConfig) GetCustomPricingData() (*CustomPricing, error) {
 	return pc.loadConfig(true)
 }
 
+// ConfigFileManager returns the ConfigFileManager instance used to manage the CustomPricing
+// configuration. In the event of a multi-provider setup, this instance should be used to
+// configure any other configuration providers.
+func (pc *ProviderConfig) ConfigFileManager() *config.ConfigFileManager {
+	return pc.configManager
+}
+
 // Allows a call to manually update the configuration while maintaining proper thread-safety
 // for read/write methods.
 func (pc *ProviderConfig) Update(updateFunc func(*CustomPricing) error) (*CustomPricing, error) {
@@ -132,7 +178,7 @@ func (pc *ProviderConfig) Update(updateFunc func(*CustomPricing) error) (*Custom
 	if err != nil {
 		return c, err
 	}
-	err = ioutil.WriteFile(pc.configPath, cj, 0644)
+	err = pc.configFile.Write(cj)
 
 	if err != nil {
 		return c, err
@@ -210,19 +256,8 @@ func SetCustomPricingField(obj *CustomPricing, name string, value string) error
 	return nil
 }
 
-// File exists has three different return cases that should be handled:
-//   1. File exists and is not a directory (true, nil)
-//   2. File does not exist (false, nil)
-//   3. File may or may not exist. Error occurred during stat (false, error)
-// The third case represents the scenario where the stat returns an error,
-// but the error isn't relevant to the path. This can happen when the current
-// user doesn't have permission to access the file.
-func fileExists(filename string) (bool, error) {
-	return fileutil.FileExists(filename) // delegate to utility method
-}
-
 // Returns the configuration directory concatenated with a specific config file name
 func configPathFor(filename string) string {
 	path := env.GetConfigPathWithDefault("/models/")
-	return path + filename
+	return gopath.Join(path, filename)
 }
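
One subtle fix in this file: configPathFor now uses path.Join instead of raw string concatenation, so config paths stay well formed whether or not the configured directory carries a trailing slash. A standalone illustration (the base values are hypothetical):

package main

import (
	"fmt"
	gopath "path"
)

func main() {
	// With a trailing slash, both forms agree.
	fmt.Println("/models/" + "default.json")             // /models/default.json
	fmt.Println(gopath.Join("/models/", "default.json")) // /models/default.json

	// Without one, concatenation silently produces a broken path.
	fmt.Println("/models" + "default.json")             // /modelsdefault.json
	fmt.Println(gopath.Join("/models", "default.json")) // /models/default.json
}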

+ 20 - 10
pkg/clustercache/clustercache.go

@@ -10,6 +10,7 @@ import (
 	autoscaling "k8s.io/api/autoscaling/v2beta1"
 	batchv1 "k8s.io/api/batch/v1"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/policy/v1beta1"
 	stv1 "k8s.io/api/storage/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/client-go/kubernetes"
@@ -24,10 +25,6 @@ type ClusterCache interface {
 	// Stops the watcher processes
 	Stop()
 
-	// Gets the underlying clientset
-	// TODO: Remove once we support all cached cluster components
-	GetClient() kubernetes.Interface
-
 	// GetAllNamespaces returns all the cached namespaces
 	GetAllNamespaces() []*v1.Namespace
 
@@ -64,9 +61,12 @@ type ClusterCache interface {
 	// GetAllJobs returns all the cached jobs
 	GetAllJobs() []*batchv1.Job
 
-	// GetAllHorizontalPodAutoscalers() returns all cached horizontal pod autoscalers
+	// GetAllHorizontalPodAutoscalers returns all cached horizontal pod autoscalers
 	GetAllHorizontalPodAutoscalers() []*autoscaling.HorizontalPodAutoscaler
 
+	// GetAllPodDisruptionBudgets returns all cached pod disruption budgets
+	GetAllPodDisruptionBudgets() []*v1beta1.PodDisruptionBudget
+
 	// SetConfigMapUpdateFunc sets the configmap update function
 	SetConfigMapUpdateFunc(func(interface{}))
 }
@@ -89,6 +89,7 @@ type KubernetesClusterCache struct {
 	storageClassWatch      WatchController
 	jobsWatch              WatchController
 	hpaWatch               WatchController
+	pdbWatch               WatchController
 	stop                   chan struct{}
 }
 
@@ -103,6 +104,7 @@ func NewKubernetesClusterCache(client kubernetes.Interface) ClusterCache {
 	storageRestClient := client.StorageV1().RESTClient()
 	batchClient := client.BatchV1().RESTClient()
 	autoscalingClient := client.AutoscalingV2beta1().RESTClient()
+	pdbClient := client.PolicyV1beta1().RESTClient()
 
 	kubecostNamespace := env.GetKubecostNamespace()
 	klog.Infof("NAMESPACE: %s", kubecostNamespace)
@@ -123,11 +125,12 @@ func NewKubernetesClusterCache(client kubernetes.Interface) ClusterCache {
 		storageClassWatch:      NewCachingWatcher(storageRestClient, "storageclasses", &stv1.StorageClass{}, "", fields.Everything()),
 		jobsWatch:              NewCachingWatcher(batchClient, "jobs", &batchv1.Job{}, "", fields.Everything()),
 		hpaWatch:               NewCachingWatcher(autoscalingClient, "horizontalpodautoscalers", &autoscaling.HorizontalPodAutoscaler{}, "", fields.Everything()),
+		pdbWatch:               NewCachingWatcher(pdbClient, "poddisruptionbudgets", &v1beta1.PodDisruptionBudget{}, "", fields.Everything()),
 	}
 
 	// Wait for each caching watcher to initialize
 	var wg sync.WaitGroup
-	wg.Add(14)
+	wg.Add(15)
 
 	cancel := make(chan struct{})
 
@@ -145,6 +148,7 @@ func NewKubernetesClusterCache(client kubernetes.Interface) ClusterCache {
 	go initializeCache(kcc.storageClassWatch, &wg, cancel)
 	go initializeCache(kcc.jobsWatch, &wg, cancel)
 	go initializeCache(kcc.hpaWatch, &wg, cancel)
+	go initializeCache(kcc.pdbWatch, &wg, cancel)
 
 	wg.Wait()
 
@@ -171,6 +175,7 @@ func (kcc *KubernetesClusterCache) Run() {
 	go kcc.storageClassWatch.Run(1, stopCh)
 	go kcc.jobsWatch.Run(1, stopCh)
 	go kcc.hpaWatch.Run(1, stopCh)
+	go kcc.pdbWatch.Run(1, stopCh)
 
 	kcc.stop = stopCh
 }
@@ -184,10 +189,6 @@ func (kcc *KubernetesClusterCache) Stop() {
 	kcc.stop = nil
 }
 
-func (kcc *KubernetesClusterCache) GetClient() kubernetes.Interface {
-	return kcc.client
-}
-
 func (kcc *KubernetesClusterCache) GetAllNamespaces() []*v1.Namespace {
 	var namespaces []*v1.Namespace
 	items := kcc.namespaceWatch.GetAll()
@@ -305,6 +306,15 @@ func (kcc *KubernetesClusterCache) GetAllHorizontalPodAutoscalers() []*autoscali
 	return hpas
 }
 
+func (kcc *KubernetesClusterCache) GetAllPodDisruptionBudgets() []*v1beta1.PodDisruptionBudget {
+	var pdbs []*v1beta1.PodDisruptionBudget
+	items := kcc.pdbWatch.GetAll()
+	for _, pdb := range items {
+		pdbs = append(pdbs, pdb.(*v1beta1.PodDisruptionBudget))
+	}
+	return pdbs
+}
+
 func (kcc *KubernetesClusterCache) SetConfigMapUpdateFunc(f func(interface{})) {
 	kcc.kubecostConfigMapWatch.SetUpdateHandler(f)
 }
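
For completeness, a small sketch of consuming the new accessor; the function and printed fields are illustrative, but GetAllPodDisruptionBudgets is the interface method added above.

package example

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/clustercache"
)

// logPodDisruptionBudgets prints the namespace/name of every cached PDB.
func logPodDisruptionBudgets(cache clustercache.ClusterCache) {
	for _, pdb := range cache.GetAllPodDisruptionBudgets() {
		fmt.Printf("pdb %s/%s\n", pdb.Namespace, pdb.Name)
	}
}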

+ 114 - 0
pkg/clustercache/clusterexporter.go

@@ -0,0 +1,114 @@
+package clustercache
+
+import (
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/config"
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/util/atomic"
+	"github.com/kubecost/cost-model/pkg/util/json"
+
+	appsv1 "k8s.io/api/apps/v1"
+	autoscaling "k8s.io/api/autoscaling/v2beta1"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/policy/v1beta1"
+	stv1 "k8s.io/api/storage/v1"
+)
+
+// clusterEncoding is used to represent the cluster objects in the encoded states.
+type clusterEncoding struct {
+	Namespaces               []*v1.Namespace                        `json:"namespaces,omitempty"`
+	Nodes                    []*v1.Node                             `json:"nodes,omitempty"`
+	Pods                     []*v1.Pod                              `json:"pods,omitempty"`
+	Services                 []*v1.Service                          `json:"services,omitempty"`
+	DaemonSets               []*appsv1.DaemonSet                    `json:"daemonSets,omitempty"`
+	Deployments              []*appsv1.Deployment                   `json:"deployments,omitempty"`
+	StatefulSets             []*appsv1.StatefulSet                  `json:"statefulSets,omitempty"`
+	ReplicaSets              []*appsv1.ReplicaSet                   `json:"replicaSets,omitempty"`
+	PersistentVolumes        []*v1.PersistentVolume                 `json:"persistentVolumes,omitempty"`
+	PersistentVolumeClaims   []*v1.PersistentVolumeClaim            `json:"persistentVolumeClaims,omitempty"`
+	StorageClasses           []*stv1.StorageClass                   `json:"storageClasses,omitempty"`
+	Jobs                     []*batchv1.Job                         `json:"jobs,omitempty"`
+	HorizontalPodAutoscalers []*autoscaling.HorizontalPodAutoscaler `json:"horizontalPodAutoscalers,omitempty"`
+	PodDisruptionBudgets     []*v1beta1.PodDisruptionBudget         `json:"podDisruptionBudgets,omitempty"`
+}
+
+// ClusterExporter manages and runs a file export process that dumps the local kubernetes cluster to a target location.
+type ClusterExporter struct {
+	cluster  ClusterCache
+	target   *config.ConfigFile
+	interval time.Duration
+	runState atomic.AtomicRunState
+}
+
+// NewClusterExporter creates a new ClusterExporter instance for exporting the kubernetes cluster.
+func NewClusterExporter(cluster ClusterCache, target *config.ConfigFile, interval time.Duration) *ClusterExporter {
+	return &ClusterExporter{
+		cluster:  cluster,
+		target:   target,
+		interval: interval,
+	}
+}
+
+// Run starts the automated process of running Export on a specific interval.
+func (ce *ClusterExporter) Run() {
+	// in the event there is a race that occurs between Run() and Stop(), we
+	// ensure that we wait for the reset to occur before starting again
+	ce.runState.WaitForReset()
+
+	if !ce.runState.Start() {
+		log.Warningf("ClusterExporter already running")
+		return
+	}
+
+	go func() {
+		for {
+			err := ce.Export()
+			if err != nil {
+				log.Warningf("Failed to export cluster: %s", err)
+			}
+
+			select {
+			case <-time.After(ce.interval):
+			case <-ce.runState.OnStop():
+				ce.runState.Reset()
+				return
+			}
+		}
+	}()
+}
+
+// Stop halts the Cluster export on an interval
+func (ce *ClusterExporter) Stop() {
+	ce.runState.Stop()
+}
+
+// Export stores the cluster cache data into a PODO, marshals as JSON, and saves it to the
+// target location.
+func (ce *ClusterExporter) Export() error {
+	c := ce.cluster
+	encoding := &clusterEncoding{
+		Namespaces:               c.GetAllNamespaces(),
+		Nodes:                    c.GetAllNodes(),
+		Pods:                     c.GetAllPods(),
+		Services:                 c.GetAllServices(),
+		DaemonSets:               c.GetAllDaemonSets(),
+		Deployments:              c.GetAllDeployments(),
+		StatefulSets:             c.GetAllStatefulSets(),
+		ReplicaSets:              c.GetAllReplicaSets(),
+		PersistentVolumes:        c.GetAllPersistentVolumes(),
+		PersistentVolumeClaims:   c.GetAllPersistentVolumeClaims(),
+		StorageClasses:           c.GetAllStorageClasses(),
+		Jobs:                     c.GetAllJobs(),
+		HorizontalPodAutoscalers: c.GetAllHorizontalPodAutoscalers(),
+		PodDisruptionBudgets:     c.GetAllPodDisruptionBudgets(),
+	}
+
+	data, err := json.Marshal(encoding)
+	if err != nil {
+		return err
+	}
+
+	return ce.target.Write(data)
+}
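
A hedged sketch of standing up the exporter outside of main.go, assuming a ClusterCache and ConfigFileManager built as shown earlier; the five-minute interval matches the ClusterExportInterval constant above, and the target path mirrors the one used in main.go.

package example

import (
	"time"

	"github.com/kubecost/cost-model/pkg/clustercache"
	"github.com/kubecost/cost-model/pkg/config"
)

// runExporter periodically dumps the cluster cache to the shared config store.
func runExporter(cache clustercache.ClusterCache, cm *config.ConfigFileManager) *clustercache.ClusterExporter {
	target := cm.ConfigFileAt("/var/configs/cluster-cache.json")
	exporter := clustercache.NewClusterExporter(cache, target, 5*time.Minute)
	exporter.Run() // exports immediately, then on every interval until Stop()
	return exporter
}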

+ 309 - 0
pkg/clustercache/clusterimporter.go

@@ -0,0 +1,309 @@
+package clustercache
+
+import (
+	"sync"
+
+	"github.com/kubecost/cost-model/pkg/config"
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/util/json"
+	appsv1 "k8s.io/api/apps/v1"
+	autoscaling "k8s.io/api/autoscaling/v2beta1"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/policy/v1beta1"
+	stv1 "k8s.io/api/storage/v1"
+)
+
+// ClusterImporter is an implementation of ClusterCache which leverages a backing configuration file
+// as its source of the cluster data.
+type ClusterImporter struct {
+	source          *config.ConfigFile
+	sourceHandlerID config.HandlerID
+	dataLock        *sync.Mutex
+	data            *clusterEncoding
+}
+
+// NewClusterImporter creates a new ClusterCache implementation which uses an import process to provide cluster data
+func NewClusterImporter(source *config.ConfigFile) ClusterCache {
+	return &ClusterImporter{
+		source:   source,
+		dataLock: new(sync.Mutex),
+		data:     new(clusterEncoding),
+	}
+}
+
+// onImportSourceChanged handles the source data updating
+func (ci *ClusterImporter) onImportSourceChanged(changeType config.ChangeType, data []byte) {
+	if changeType == config.ChangeTypeDeleted {
+		ci.dataLock.Lock()
+		ci.data = new(clusterEncoding)
+		ci.dataLock.Unlock()
+		return
+	}
+
+	ci.update(data)
+}
+
+// update replaces the underlying cluster data with the provided new data if it decodes
+func (ci *ClusterImporter) update(data []byte) {
+	ce := new(clusterEncoding)
+	err := json.Unmarshal(data, ce)
+	if err != nil {
+		log.Warningf("Failed to unmarshal cluster during import: %s", err)
+		return
+	}
+
+	ci.dataLock.Lock()
+	ci.data = ce
+	ci.dataLock.Unlock()
+}
+
+// Run starts the watcher processes
+func (ci *ClusterImporter) Run() {
+	if ci.source == nil {
+		log.Errorf("ClusterImporter source does not exist, not running")
+		return
+	}
+
+	exists, err := ci.source.Exists()
+	if err != nil {
+		log.Errorf("Failed to import source for cluster: %s", err)
+		return
+	}
+
+	if exists {
+		data, err := ci.source.Read()
+		if err != nil {
+			log.Warningf("Failed to import cluster: %s", err)
+		} else {
+			ci.update(data)
+		}
+	}
+
+	ci.sourceHandlerID = ci.source.AddChangeHandler(ci.onImportSourceChanged)
+}
+
+// Stops the watcher processes
+func (ci *ClusterImporter) Stop() {
+	if ci.sourceHandlerID != "" {
+		ci.source.RemoveChangeHandler(ci.sourceHandlerID)
+		ci.sourceHandlerID = ""
+	}
+}
+
+// GetAllNamespaces returns all the cached namespaces
+func (ci *ClusterImporter) GetAllNamespaces() []*v1.Namespace {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	namespaces := ci.data.Namespaces
+	cloneList := make([]*v1.Namespace, 0, len(namespaces))
+	for _, v := range namespaces {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllNodes returns all the cached nodes
+func (ci *ClusterImporter) GetAllNodes() []*v1.Node {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	nodes := ci.data.Nodes
+	cloneList := make([]*v1.Node, 0, len(nodes))
+	for _, v := range nodes {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPods returns all the cached pods
+func (ci *ClusterImporter) GetAllPods() []*v1.Pod {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pods := ci.data.Pods
+	cloneList := make([]*v1.Pod, 0, len(pods))
+	for _, v := range pods {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllServices returns all the cached services
+func (ci *ClusterImporter) GetAllServices() []*v1.Service {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	services := ci.data.Services
+	cloneList := make([]*v1.Service, 0, len(services))
+	for _, v := range services {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllDaemonSets returns all the cached DaemonSets
+func (ci *ClusterImporter) GetAllDaemonSets() []*appsv1.DaemonSet {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	daemonSets := ci.data.DaemonSets
+	cloneList := make([]*appsv1.DaemonSet, 0, len(daemonSets))
+	for _, v := range daemonSets {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllDeployments returns all the cached deployments
+func (ci *ClusterImporter) GetAllDeployments() []*appsv1.Deployment {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	deployments := ci.data.Deployments
+	cloneList := make([]*appsv1.Deployment, 0, len(deployments))
+	for _, v := range deployments {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllStatefulSets returns all the cached StatefulSets
+func (ci *ClusterImporter) GetAllStatefulSets() []*appsv1.StatefulSet {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	statefulSets := ci.data.StatefulSets
+	cloneList := make([]*appsv1.StatefulSet, 0, len(statefulSets))
+	for _, v := range statefulSets {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllReplicaSets returns all the cached ReplicaSets
+func (ci *ClusterImporter) GetAllReplicaSets() []*appsv1.ReplicaSet {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	replicaSets := ci.data.ReplicaSets
+	cloneList := make([]*appsv1.ReplicaSet, 0, len(replicaSets))
+	for _, v := range replicaSets {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPersistentVolumes returns all the cached persistent volumes
+func (ci *ClusterImporter) GetAllPersistentVolumes() []*v1.PersistentVolume {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pvs := ci.data.PersistentVolumes
+	cloneList := make([]*v1.PersistentVolume, 0, len(pvs))
+	for _, v := range pvs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPersistentVolumeClaims returns all the cached persistent volume claims
+func (ci *ClusterImporter) GetAllPersistentVolumeClaims() []*v1.PersistentVolumeClaim {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pvcs := ci.data.PersistentVolumeClaims
+	cloneList := make([]*v1.PersistentVolumeClaim, 0, len(pvcs))
+	for _, v := range pvcs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllStorageClasses returns all the cached storage classes
+func (ci *ClusterImporter) GetAllStorageClasses() []*stv1.StorageClass {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	storageClasses := ci.data.StorageClasses
+	cloneList := make([]*stv1.StorageClass, 0, len(storageClasses))
+	for _, v := range storageClasses {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllJobs returns all the cached jobs
+func (ci *ClusterImporter) GetAllJobs() []*batchv1.Job {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	jobs := ci.data.Jobs
+	cloneList := make([]*batchv1.Job, 0, len(jobs))
+	for _, v := range jobs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllHorizontalPodAutoscalers returns all cached horizontal pod autoscalers
+func (ci *ClusterImporter) GetAllHorizontalPodAutoscalers() []*autoscaling.HorizontalPodAutoscaler {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	hpas := ci.data.HorizontalPodAutoscalers
+	cloneList := make([]*autoscaling.HorizontalPodAutoscaler, 0, len(hpas))
+	for _, v := range hpas {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// GetAllPodDisruptionBudgets returns all cached pod disruption budgets
+func (ci *ClusterImporter) GetAllPodDisruptionBudgets() []*v1beta1.PodDisruptionBudget {
+	ci.dataLock.Lock()
+	defer ci.dataLock.Unlock()
+
+	// Deep copy here to avoid callers from corrupting the cache
+	// This also mimics the behavior of the default cluster cache impl.
+	pdbs := ci.data.PodDisruptionBudgets
+	cloneList := make([]*v1beta1.PodDisruptionBudget, 0, len(pdbs))
+	for _, v := range pdbs {
+		cloneList = append(cloneList, v.DeepCopy())
+	}
+	return cloneList
+}
+
+// SetConfigMapUpdateFunc sets the configmap update function
+func (ci *ClusterImporter) SetConfigMapUpdateFunc(_ func(interface{})) {
+	// TODO: (bolt) This function is still a bit strange to me for the ClusterCache interface.
+	// TODO: (bolt) no-op for now.
+	log.Warningf("SetConfigMapUpdateFunc is disabled for imported cluster data.")
+}
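
The importer is the read side of the exporter above: a process without API access to the remote cluster can reconstruct a ClusterCache view from the exported file. A sketch under those assumptions (the path is illustrative):

package example

import (
	"fmt"

	"github.com/kubecost/cost-model/pkg/clustercache"
	"github.com/kubecost/cost-model/pkg/config"
)

// importCluster loads a remote cluster's exported state and lists its nodes.
func importCluster(cm *config.ConfigFileManager) clustercache.ClusterCache {
	source := cm.ConfigFileAt("/var/configs/cluster-cache.json")

	imported := clustercache.NewClusterImporter(source)
	imported.Run() // reads the file if present, then watches it for changes

	for _, node := range imported.GetAllNodes() {
		fmt.Println("imported node:", node.Name)
	}
	return imported
}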

+ 364 - 0
pkg/config/configfile.go

@@ -0,0 +1,364 @@
+package config
+
+import (
+	"errors"
+	"os"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/storage"
+	"github.com/kubecost/cost-model/pkg/util/atomic"
+)
+
+// HandlerID is a unique identifier assigned to a provided ConfigChangedHandler. This is used to remove a handler
+// from the ConfigFile when it is no longer needed.
+type HandlerID string
+
+//--------------------------------------------------------------------------
+//  ChangeType
+//--------------------------------------------------------------------------
+
+// ChangeType is used to specifically categorize the change that was made on a ConfigFile
+type ChangeType string
+
+// ChangeType constants contain the different types of updates passed through the ConfigChangedHandler
+const (
+	ChangeTypeCreated  ChangeType = "created"
+	ChangeTypeModified ChangeType = "modified"
+	ChangeTypeDeleted  ChangeType = "deleted"
+)
+
+//--------------------------------------------------------------------------
+//  ConfigChangedHandler
+//--------------------------------------------------------------------------
+
+// ConfigChangedHandler is the func handler used to receive change updates about the
+// config file. Both ChangeTypeCreated and ChangeTypeModified yield a valid []byte, while
+// ChangeTypeDeleted yields a nil []byte.
+type ConfigChangedHandler func(ChangeType, []byte)
+
+//--------------------------------------------------------------------------
+//  ConfigFile
+//--------------------------------------------------------------------------
+
+// DefaultHandlerPriority is used as the priority for any handlers added via AddChangeHandler
+const DefaultHandlerPriority int = 1000
+
+// NoBackingStore error is used when the config file's backing storage is missing
+var NoBackingStore error = errors.New("Backing storage does not exist.")
+
+// ConfigFile is representation of a configuration file that can be written to, read, and watched
+// for updates
+type ConfigFile struct {
+	store      storage.Storage
+	file       string
+	dataLock   *sync.Mutex
+	data       []byte
+	watchLock  *sync.Mutex
+	watchers   []*pHandler
+	runState   atomic.AtomicRunState
+	lastChange time.Time
+}
+
+// NewConfigFile creates a new ConfigFile instance using a specific storage.Storage and path relative
+// to the storage.
+func NewConfigFile(store storage.Storage, file string) *ConfigFile {
+	return &ConfigFile{
+		store:     store,
+		file:      file,
+		dataLock:  new(sync.Mutex),
+		data:      nil,
+		watchLock: new(sync.Mutex),
+	}
+}
+
+// Path returns the fully qualified path of the config file.
+func (cf *ConfigFile) Path() string {
+	if cf.store == nil {
+		return cf.file
+	}
+
+	return cf.store.FullPath(cf.file)
+}
+
+// Write will write the binary data to the file.
+func (cf *ConfigFile) Write(data []byte) error {
+	if cf.store == nil {
+		return NoBackingStore
+	}
+
+	e := cf.store.Write(cf.file, data)
+	// update cache on successful write
+	if e == nil {
+		cf.dataLock.Lock()
+		cf.data = data
+		cf.dataLock.Unlock()
+	}
+	return e
+}
+
+// Read will read the binary data from the file and return it. If an error is returned,
+// the byte array will be nil.
+func (cf *ConfigFile) Read() ([]byte, error) {
+	return cf.internalRead(false)
+}
+
+// internalRead is used to allow a forced override of data cache to refresh data
+func (cf *ConfigFile) internalRead(force bool) ([]byte, error) {
+	if cf.store == nil {
+		return nil, NoBackingStore
+	}
+
+	cf.dataLock.Lock()
+	defer cf.dataLock.Unlock()
+	if !force {
+		if cf.data != nil {
+			return cf.data, nil
+		}
+	}
+
+	d, e := cf.store.Read(cf.file)
+	if e != nil {
+		return nil, e
+	}
+	cf.data = d
+	return cf.data, nil
+}
+
+// Stat returns the StorageInfo for the file.
+func (cf *ConfigFile) Stat() (*storage.StorageInfo, error) {
+	if cf.store == nil {
+		return nil, NoBackingStore
+	}
+
+	return cf.store.Stat(cf.file)
+}
+
+// Exists returns true if the file exists. If an error other than a NotExist error is returned,
+// the result will be false with the provided error.
+func (cf *ConfigFile) Exists() (bool, error) {
+	if cf.store == nil {
+		return false, NoBackingStore
+	}
+
+	return cf.store.Exists(cf.file)
+}
+
+// Delete removes the file from storage permanently.
+func (cf *ConfigFile) Delete() error {
+	if cf.store == nil {
+		return NoBackingStore
+	}
+
+	e := cf.store.Remove(cf.file)
+
+	// on removal, clear data cache
+	if e == nil {
+		cf.dataLock.Lock()
+		cf.data = nil
+		cf.dataLock.Unlock()
+	}
+	return e
+}
+
+// Refresh allows external callers to force reload the config file from internal storage. This is
+// particularly useful when the config has no change listeners, since without them the
+// data cache is not automatically updated on change
+func (cf *ConfigFile) Refresh() ([]byte, error) {
+	return cf.internalRead(true)
+}
+
+// AddChangeHandler accepts a ConfigChangedHandler function which will be called whenever the implementation
+// detects that a change has been made. A unique HandlerID is returned that can be used to remove the handler
+// if necessary.
+func (cf *ConfigFile) AddChangeHandler(handler ConfigChangedHandler) HandlerID {
+	return cf.AddPriorityChangeHandler(handler, DefaultHandlerPriority)
+}
+
+// AddPriorityChangeHandler allows adding a config change handler with a specific priority. By default,
+// any handlers added via AddChangeHandler have a default priority of 1000. The lower the priority, the
+// sooner in the handler execution it will be called.
+func (cf *ConfigFile) AddPriorityChangeHandler(handler ConfigChangedHandler, priority int) HandlerID {
+	cf.watchLock.Lock()
+	defer cf.watchLock.Unlock()
+
+	h := &pHandler{
+		id:       HandlerID(uuid.NewString()),
+		handler:  handler,
+		priority: priority,
+	}
+
+	cf.watchers = append(cf.watchers, h)
+
+	// create the actual file watcher once we have at least one active watcher func registered
+	if len(cf.watchers) == 1 {
+		cf.runWatcher()
+	}
+
+	return h.id
+}
+
+// RemoveChangeHandler removes the change handler with the provided identifier if it exists. True
+// is returned if the handler was removed (it existed), false otherwise.
+func (cf *ConfigFile) RemoveChangeHandler(id HandlerID) bool {
+	cf.watchLock.Lock()
+	defer cf.watchLock.Unlock()
+
+	for i := range cf.watchers {
+		if cf.watchers[i] != nil && cf.watchers[i].id == id {
+			copy(cf.watchers[i:], cf.watchers[i+1:])
+			cf.watchers[len(cf.watchers)-1] = nil
+			cf.watchers = cf.watchers[:len(cf.watchers)-1]
+
+			// stop watching the file for changes if there are no more external watchers
+			if len(cf.watchers) == 0 {
+				cf.stopWatcher()
+			}
+
+			return true
+		}
+	}
+	return false
+}
+
+// RemoveAllHandlers removes all added handlers
+func (cf *ConfigFile) RemoveAllHandlers() {
+	cf.watchLock.Lock()
+	defer cf.watchLock.Unlock()
+
+	cf.watchers = nil
+
+	cf.stopWatcher()
+}
+
+// runWatcher creates a goroutine which will poll the stat of a storage target on a specific
+// interval and dispatch created, modified, and deleted events for that file.
+func (cf *ConfigFile) runWatcher() {
+	// we wait for a reset on the run state prior to starting, which
+	// will only block iff the run state is in the process of stopping
+	cf.runState.WaitForReset()
+
+	// if start fails after waiting for a reset, it means that another thread
+	// beat this thread to the start
+	if !cf.runState.Start() {
+		log.Warningf("Run watcher already running for file: %s", cf.file)
+		return
+	}
+
+	go func() {
+		first := true
+
+		var last time.Time
+		var exists bool
+
+		for {
+			// Each iteration, check for the stop trigger, or wait 10 seconds
+			select {
+			case <-cf.runState.OnStop():
+				cf.runState.Reset()
+				return
+			case <-time.After(10 * time.Second):
+			}
+
+			// Query stat on the file, on errors other than exists,
+			// we'll need to log the error, and perhaps limit the retries
+			st, err := cf.Stat()
+			if err != nil && !os.IsNotExist(err) {
+				log.Errorf("Storage Stat Error: %s", err)
+				continue
+			}
+
+			// On first iteration, set exists and last modification time (if applicable)
+			// and flip flag
+			if first {
+				exists = !os.IsNotExist(err)
+				if exists {
+					last = st.ModTime
+				}
+				first = false
+				continue
+			}
+
+			// File does not exist in storage, need to check to see if that is different
+			// from last state check
+			if os.IsNotExist(err) {
+				// check to see if the file has gone from exists to !exists
+				if exists {
+					exists = false
+					cf.onFileChange(ChangeTypeDeleted, nil)
+				}
+				continue
+			}
+
+			// check to see if the file has gone from !exists to exists
+			if !exists {
+				data, err := cf.internalRead(true)
+				if err != nil {
+					log.Warningf("Read() Error: %s\n", err)
+					continue
+				}
+				exists = true
+				last = st.ModTime
+				cf.onFileChange(ChangeTypeCreated, data)
+				continue
+			}
+
+			mtime := st.ModTime
+			if mtime != last {
+				last = mtime
+				data, err := cf.internalRead(true)
+				if err != nil {
+					log.Errorf("Read() Error: %s\n", err)
+					continue
+				}
+				cf.onFileChange(ChangeTypeModified, data)
+			}
+		}
+	}()
+}
+
+// stopWatcher signals the run state to stop, returning from the runWatcher goroutine. This removes
+// the polling stat checks on files when there are no change handlers.
+func (cf *ConfigFile) stopWatcher() {
+	cf.runState.Stop()
+}
+
+// onFileChange is internally called when the core watcher recognizes a change in the ConfigFile. This
+// method dispatches that change to all added watchers
+func (cf *ConfigFile) onFileChange(changeType ChangeType, newData []byte) {
+	// On change, we copy out the handlers to a separate slice for processing for a few reasons:
+	// 1. We don't want to lock while executing the handlers
+	// 2. Handlers may want to operate on the ConfigFile instance, which would result in a deadlock
+	// 3. Allows us to implement priority sorting outside of the lock as well
+	cf.watchLock.Lock()
+	if len(cf.watchers) == 0 {
+		cf.watchLock.Unlock()
+		return
+	}
+
+	toNotify := make([]*pHandler, len(cf.watchers))
+	copy(toNotify, cf.watchers)
+	cf.watchLock.Unlock()
+
+	sort.SliceStable(toNotify, func(i, j int) bool {
+		return toNotify[i].priority < toNotify[j].priority
+	})
+
+	for _, handler := range toNotify {
+		handler.handler(changeType, newData)
+	}
+}
+
+//--------------------------------------------------------------------------
+//  pHandler
+//--------------------------------------------------------------------------
+
+// pHandler is a wrapper type used to assign a ConfigChangedHandler a unique identifier and priority.
+type pHandler struct {
+	id       HandlerID
+	handler  ConfigChangedHandler
+	priority int
+}

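Taken together, the handler API above gives a small subscribe/unsubscribe surface. A minimal usage sketch, assuming a *ConfigFile obtained from the ConfigFileManager introduced below and the ConfigChangedHandler signature implied by onFileChange (path and priority value are illustrative):

cf := confManager.ConfigFileAt("/var/configs/cluster-info.json")

// Runs ahead of default-priority handlers (DefaultHandlerPriority is 1000).
id := cf.AddPriorityChangeHandler(func(change ChangeType, data []byte) {
	switch change {
	case ChangeTypeCreated, ChangeTypeModified:
		log.Infof("config updated: %d bytes", len(data))
	case ChangeTypeDeleted:
		log.Infof("config removed")
	}
}, 100)

// Unsubscribing the last handler stops the polling watcher entirely.
cf.RemoveChangeHandler(id)
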
+ 98 - 0
pkg/config/configmanager.go

@@ -0,0 +1,98 @@
+package config
+
+import (
+	"io/ioutil"
+	"sync"
+
+	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/storage"
+)
+
+//--------------------------------------------------------------------------
+//  ConfigFileManagerOpts
+//--------------------------------------------------------------------------
+
+// ConfigFileManagerOpts describes how to configure the ConfigFileManager for
+// serving configuration files
+type ConfigFileManagerOpts struct {
+	// BucketStoreConfig is the local file location for the configuration used to
+	// write and read configuration data to/from the bucket. The format of this
+	// configuration file should be compatible with storage.NewBucketStorage
+	BucketStoreConfig string
+
+	// LocalConfigPath provides a backup location for storing the configuration
+	// files
+	LocalConfigPath string
+}
+
+// IsBucketStorageEnabled returns true if a bucket storage configuration file has been provided.
+func (cfmo *ConfigFileManagerOpts) IsBucketStorageEnabled() bool {
+	return cfmo.BucketStoreConfig != ""
+}
+
+// DefaultConfigFileManagerOpts returns the default configuration options for the
+// config file manager
+func DefaultConfigFileManagerOpts() *ConfigFileManagerOpts {
+	return &ConfigFileManagerOpts{
+		BucketStoreConfig: "",
+		LocalConfigPath:   "/",
+	}
+}
+
+//--------------------------------------------------------------------------
+//  ConfigFileManager
+//--------------------------------------------------------------------------
+
+// ConfigFileManager is a facade for a central API used to create and watch
+// config files.
+type ConfigFileManager struct {
+	lock  *sync.Mutex
+	store storage.Storage
+	files map[string]*ConfigFile
+}
+
+// NewConfigFileManager creates a new backing storage and configuration file manager
+func NewConfigFileManager(opts *ConfigFileManagerOpts) *ConfigFileManager {
+	if opts == nil {
+		opts = DefaultConfigFileManagerOpts()
+	}
+
+	var configStore storage.Storage
+	if opts.IsBucketStorageEnabled() {
+		bucketConfig, err := ioutil.ReadFile(opts.BucketStoreConfig)
+		if err != nil {
+			log.Warningf("Failed to initialize config bucket storage: %s", err)
+		} else {
+			bucketStore, err := storage.NewBucketStorage(bucketConfig)
+			if err != nil {
+				log.Warningf("Failed to create config bucket storage: %s", err)
+			} else {
+				configStore = bucketStore
+			}
+		}
+	} else {
+		configStore = storage.NewFileStorage(opts.LocalConfigPath)
+	}
+
+	return &ConfigFileManager{
+		lock:  new(sync.Mutex),
+		store: configStore,
+		files: make(map[string]*ConfigFile),
+	}
+}
+
+// ConfigFileAt returns the existing ConfigFile instance for the provided path if one has been
+// created. Otherwise, a new instance is created and returned. Note that the path does not have
+// to exist on the storage for the instance to be created; it can refer to a potential file path
+// that is written later.
+func (cfm *ConfigFileManager) ConfigFileAt(path string) *ConfigFile {
+	cfm.lock.Lock()
+	defer cfm.lock.Unlock()
+	if cf, ok := cfm.files[path]; ok {
+		return cf
+	}
+
+	cf := NewConfigFile(cfm.store, path)
+	cfm.files[path] = cf
+	return cf
+}

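For reference, the wiring used by the callers in this PR boils down to the following sketch (paths mirror the diff; a nil opts value falls back to DefaultConfigFileManagerOpts):

confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
	BucketStoreConfig: env.GetKubecostConfigBucket(), // empty string disables bucket storage
	LocalConfigPath:   "/",
})

// Instances are cached by path, so repeated lookups share one ConfigFile.
cacheFile := confManager.ConfigFileAt("/var/configs/cluster-cache.json")
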
+ 82 - 21
pkg/costmodel/clusterinfo.go

@@ -4,9 +4,12 @@ import (
 	"fmt"
 
 	cloudProvider "github.com/kubecost/cost-model/pkg/cloud"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 	"github.com/kubecost/cost-model/pkg/env"
+	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/thanos"
+	"github.com/kubecost/cost-model/pkg/util/json"
 
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/klog"
@@ -33,6 +36,7 @@ func writeClusterProfile(clusterInfo map[string]string) {
 	clusterInfo["clusterProfile"] = clusterProfile
 }
 
+// writeThanosFlags includes the configured thanos flags on the cluster info
 func writeThanosFlags(clusterInfo map[string]string) {
 	// Include Thanos Offset Duration if Applicable
 	clusterInfo["thanosEnabled"] = fmt.Sprintf("%t", thanos.IsEnabled())
@@ -41,38 +45,22 @@ func writeThanosFlags(clusterInfo map[string]string) {
 	}
 }
 
-// default local cluster info provider implementation which provides an instanced object for
-// getting the local cluster info
-type defaultLocalClusterInfoProvider struct {
+// localClusterInfoProvider gets the local cluster info from the cloud provider and kubernetes
+type localClusterInfoProvider struct {
 	k8s      kubernetes.Interface
 	provider cloudProvider.Provider
 }
 
 // GetClusterInfo returns a string map containing the local cluster info
-func (dlcip *defaultLocalClusterInfoProvider) GetClusterInfo() map[string]string {
-	return GetClusterInfo(dlcip.k8s, dlcip.provider)
-}
-
-// NewLocalClusterInfoProvider creates a new clusters.LocalClusterInfoProvider implementation for providing local
-// cluster information
-func NewLocalClusterInfoProvider(k8s kubernetes.Interface, cloud cloudProvider.Provider) clusters.LocalClusterInfoProvider {
-	return &defaultLocalClusterInfoProvider{
-		k8s:      k8s,
-		provider: cloud,
-	}
-}
-
-// GetClusterInfo provides specific information about the cluster cloud provider as well as
-// generic configuration values.
-func GetClusterInfo(kubeClient kubernetes.Interface, cloud cloudProvider.Provider) map[string]string {
-	data, err := cloud.ClusterInfo()
+func (dlcip *localClusterInfoProvider) GetClusterInfo() map[string]string {
+	data, err := dlcip.provider.ClusterInfo()
 
 	// Ensure we create the info object if it doesn't exist
 	if data == nil {
 		data = make(map[string]string)
 	}
 
-	kc, ok := kubeClient.(*kubernetes.Clientset)
+	kc, ok := dlcip.k8s.(*kubernetes.Clientset)
 	if ok && data != nil {
 		v, err := kc.ServerVersion()
 		if err != nil {
@@ -90,3 +78,76 @@ func GetClusterInfo(kubeClient kubernetes.Interface, cloud cloudProvider.Provide
 
 	return data
 }
+
+// NewLocalClusterInfoProvider creates a new clusters.LocalClusterInfoProvider implementation for providing local
+// cluster information
+func NewLocalClusterInfoProvider(k8s kubernetes.Interface, cloud cloudProvider.Provider) clusters.ClusterInfoProvider {
+	return &localClusterInfoProvider{
+		k8s:      k8s,
+		provider: cloud,
+	}
+}
+
+// configuredClusterInfoProvider just provides the cluster info directly from the config file source.
+type configuredClusterInfoProvider struct {
+	config *config.ConfigFile
+}
+
+// GetClusterInfo returns a string map containing the local cluster info
+func (ccip *configuredClusterInfoProvider) GetClusterInfo() map[string]string {
+	clusterInfo := map[string]string{}
+
+	data, err := ccip.config.Refresh()
+	if err != nil {
+		return clusterInfo
+	}
+
+	err = json.Unmarshal(data, &clusterInfo)
+	if err != nil {
+		log.Warningf("ClusterInfo failed to load from configuration: %s", err)
+		return clusterInfo
+	}
+
+	return clusterInfo
+}
+
+// NewConfiguredClusterInfoProvider instantiates and returns a cluster info provider which loads cluster info from
+// a config file.
+func NewConfiguredClusterInfoProvider(config *config.ConfigFile) clusters.ClusterInfoProvider {
+	return &configuredClusterInfoProvider{
+		config: config,
+	}
+}
+
+// clusterInfoWriteOnRequest writes the cluster info result to a config whenever it's requested
+type clusterInfoWriteOnRequest struct {
+	clusterInfo clusters.ClusterInfoProvider
+	config      *config.ConfigFile
+}
+
+// GetClusterInfo returns a string map containing the local cluster info
+func (ciw *clusterInfoWriteOnRequest) GetClusterInfo() map[string]string {
+	cInfo := ciw.clusterInfo.GetClusterInfo()
+
+	result, err := json.Marshal(cInfo)
+	if err != nil {
+		log.Warningf("Failed to marshal the cluster info: %s", err)
+		return cInfo
+	}
+
+	err = ciw.config.Write(result)
+	if err != nil {
+		log.Warningf("Failed to write the cluster info to config: %s", err)
+	}
+
+	return cInfo
+}
+
+// NewClusterInfoWriteOnRequest instantiates and returns a cluster info provider which writes the cluster info to a configuration
+// before each request.
+func NewClusterInfoWriteOnRequest(clusterInfo clusters.ClusterInfoProvider, config *config.ConfigFile) clusters.ClusterInfoProvider {
+	return &clusterInfoWriteOnRequest{
+		clusterInfo: clusterInfo,
+		config:      config,
+	}
+}

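The three providers compose. A hypothetical wiring within this package (variable names follow router.go's Initialize in this PR) wraps the local provider so each lookup also persists its result for read-only consumers:

local := NewLocalClusterInfoProvider(kubeClientset, cloudProvider)
infoFile := confManager.ConfigFileAt("/var/configs/cluster-info.json")

// Every GetClusterInfo() call now writes the marshalled map to infoFile; a
// consumer running with CLUSTER_INFO_FILE_ENABLED=true reads it back via
// NewConfiguredClusterInfoProvider(infoFile).
provider := NewClusterInfoWriteOnRequest(local, infoFile)
info := provider.GetClusterInfo()
log.Infof("cluster info contains %d fields", len(info))
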
+ 23 - 24
pkg/costmodel/clusters/clustermap.go

@@ -7,7 +7,6 @@ import (
 	"sync"
 	"time"
 
-	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/prom"
 	"github.com/kubecost/cost-model/pkg/thanos"
@@ -76,31 +75,31 @@ type ClusterMap interface {
 	StopRefresh()
 }
 
-// LocalClusterInfoProvider is a contract which is capable of performing local cluster info lookups.
-type LocalClusterInfoProvider interface {
-	// GetClusterInfo returns a string map containing the local cluster info
+// ClusterInfoProvider is a contract which is capable of performing cluster info lookups.
+type ClusterInfoProvider interface {
+	// GetClusterInfo returns a string map containing the local/remote connected cluster info
 	GetClusterInfo() map[string]string
 }
 
 // ClusterMap keeps records of all known cost-model clusters.
 type PrometheusClusterMap struct {
-	lock         *sync.RWMutex
-	client       prometheus.Client
-	clusters     map[string]*ClusterInfo
-	localCluster LocalClusterInfoProvider
-	stop         chan struct{}
+	lock        *sync.RWMutex
+	client      prometheus.Client
+	clusters    map[string]*ClusterInfo
+	clusterInfo ClusterInfoProvider
+	stop        chan struct{}
 }
 
 // NewClusterMap creates a new ClusterMap implementation using a prometheus or thanos client
-func NewClusterMap(client prometheus.Client, lcip LocalClusterInfoProvider, refresh time.Duration) ClusterMap {
+func NewClusterMap(client prometheus.Client, cip ClusterInfoProvider, refresh time.Duration) ClusterMap {
 	stop := make(chan struct{})
 
 	cm := &PrometheusClusterMap{
-		lock:         new(sync.RWMutex),
-		client:       client,
-		clusters:     make(map[string]*ClusterInfo),
-		localCluster: lcip,
-		stop:         stop,
+		lock:        new(sync.RWMutex),
+		client:      client,
+		clusters:    make(map[string]*ClusterInfo),
+		clusterInfo: cip,
+		stop:        stop,
 	}
 
 	// Run an updater to ensure cluster data stays relevant over time
@@ -210,14 +209,14 @@ func (pcm *PrometheusClusterMap) loadClusters() (map[string]*ClusterInfo, error)
 	}
 
 	// populate the local cluster if it doesn't exist
-	localID := env.GetClusterID()
-	if _, ok := clusters[localID]; !ok {
-		localInfo, err := pcm.getLocalClusterInfo()
-		if err != nil {
-			log.Warningf("Failed to load local cluster info: %s", err)
-		} else {
-			clusters[localInfo.ID] = localInfo
-		}
+	localInfo, err := pcm.getLocalClusterInfo()
+	if err != nil {
+		return clusters, nil
+	}
+
+	// Check to see if the local cluster's id is part of our loaded clusters, and include if not
+	if _, ok := clusters[localInfo.ID]; !ok {
+		clusters[localInfo.ID] = localInfo
 	}
 
 	return clusters, nil
@@ -225,7 +224,7 @@ func (pcm *PrometheusClusterMap) loadClusters() (map[string]*ClusterInfo, error)
 
 // getLocalClusterInfo returns the local cluster info in the event there does not exist a metric available.
 func (pcm *PrometheusClusterMap) getLocalClusterInfo() (*ClusterInfo, error) {
-	info := pcm.localCluster.GetClusterInfo()
+	info := pcm.clusterInfo.GetClusterInfo()
 
 	var id string
 	var name string

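Because ClusterInfoProvider is a one-method contract, swapping in a stub for tests or alternate sources is straightforward; a hypothetical example within the clusters package:

type staticClusterInfoProvider map[string]string

func (s staticClusterInfoProvider) GetClusterInfo() map[string]string { return s }

// cm := NewClusterMap(promClient, staticClusterInfoProvider{"id": "cluster-one"}, 5*time.Minute)
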
+ 19 - 52
pkg/costmodel/metrics.go

@@ -9,20 +9,20 @@ import (
 
 	"github.com/kubecost/cost-model/pkg/cloud"
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/errors"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/metrics"
 	"github.com/kubecost/cost-model/pkg/prom"
 	"github.com/kubecost/cost-model/pkg/util"
+	"github.com/kubecost/cost-model/pkg/util/atomic"
 
 	promclient "github.com/prometheus/client_golang/api"
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
 	v1 "k8s.io/api/core/v1"
 
-	"k8s.io/client-go/kubernetes"
-
 	"k8s.io/klog"
 )
 
@@ -32,8 +32,7 @@ import (
 
 // ClusterInfoCollector is a prometheus collector that generates ClusterInfoMetrics
 type ClusterInfoCollector struct {
-	Cloud         cloud.Provider
-	KubeClientSet kubernetes.Interface
+	ClusterInfo clusters.ClusterInfoProvider
 }
 
 // Describe sends the super-set of all possible descriptors of metrics
@@ -44,7 +43,7 @@ func (cic ClusterInfoCollector) Describe(ch chan<- *prometheus.Desc) {
 
 // Collect is called by the Prometheus registry when collecting metrics.
 func (cic ClusterInfoCollector) Collect(ch chan<- prometheus.Metric) {
-	clusterInfo := GetClusterInfo(cic.KubeClientSet, cic.Cloud)
+	clusterInfo := cic.ClusterInfo.GetClusterInfo()
 	labels := prom.MapToLabels(clusterInfo)
 
 	m := newClusterInfoMetric("kubecost_cluster_info", labels)
@@ -126,7 +125,7 @@ var (
 )
 
 // initCostModelMetrics uses a sync.Once to ensure that these metrics are only created once
-func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud.Provider) {
+func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud.Provider, clusterInfo clusters.ClusterInfoProvider) {
 	metricsInit.Do(func() {
 		cpuGv = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "node_cpu_hourly_cost",
@@ -216,8 +215,7 @@ func initCostModelMetrics(clusterCache clustercache.ClusterCache, provider cloud
 
 		// General Metric Collectors
 		prometheus.MustRegister(ClusterInfoCollector{
-			KubeClientSet: clusterCache.GetClient(),
-			Cloud:         provider,
+			ClusterInfo: clusterInfo,
 		})
 	})
 }
@@ -252,16 +250,14 @@ type CostModelMetricsEmitter struct {
 	NetworkRegionEgressRecorder   prometheus.Gauge
 	NetworkInternetEgressRecorder prometheus.Gauge
 
-	// Flow Control
-	recordingLock     *sync.Mutex
-	recordingStopping bool
-	recordingStop     chan bool
+	// Concurrent Flow Control - Manages the run state of the metric emitter
+	runState atomic.AtomicRunState
 }
 
 // NewCostModelMetricsEmitter creates a new cost-model metrics emitter. Use Start() to begin metric emission.
-func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clustercache.ClusterCache, provider cloud.Provider, model *CostModel) *CostModelMetricsEmitter {
+func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clustercache.ClusterCache, provider cloud.Provider, clusterInfo clusters.ClusterInfoProvider, model *CostModel) *CostModelMetricsEmitter {
 	// init will only actually execute once to register the custom gauges
-	initCostModelMetrics(clusterCache, provider)
+	initCostModelMetrics(clusterCache, provider, clusterInfo)
 
 	metrics.InitKubeMetrics(clusterCache, &metrics.KubeMetricsOpts{
 		EmitKubecostControllerMetrics: true,
@@ -292,33 +288,12 @@ func NewCostModelMetricsEmitter(promClient promclient.Client, clusterCache clust
 		NetworkInternetEgressRecorder: networkInternetEgressCostG,
 		ClusterManagementCostRecorder: clusterManagementCostGv,
 		LBCostRecorder:                lbCostGv,
-		recordingLock:                 new(sync.Mutex),
-		recordingStopping:             false,
-		recordingStop:                 nil,
-	}
-}
-
-// Checks to see if there is a metric recording stop channel. If it exists, a new
-// channel is not created and false is returned. If it doesn't exist, a new channel
-// is created and true is returned.
-func (cmme *CostModelMetricsEmitter) checkOrCreateRecordingChan() bool {
-	cmme.recordingLock.Lock()
-	defer cmme.recordingLock.Unlock()
-
-	if cmme.recordingStop != nil {
-		return false
 	}
-
-	cmme.recordingStop = make(chan bool, 1)
-	return true
 }
 
 // IsRunning returns true if metric recording is running.
 func (cmme *CostModelMetricsEmitter) IsRunning() bool {
-	cmme.recordingLock.Lock()
-	defer cmme.recordingLock.Unlock()
-
-	return cmme.recordingStop != nil
+	return cmme.runState.IsRunning()
 }
 
 // NodeCostAverages tracks a running average of a node's cost attributes.
@@ -333,10 +308,11 @@ type NodeCostAverages struct {
 // StartCostModelMetricRecording starts the go routine that emits metrics used to determine
 // cluster costs.
 func (cmme *CostModelMetricsEmitter) Start() bool {
-	// Check to see if we're already recording
-	// This function will create the stop recording channel and return true
-	// if it doesn't exist.
-	if !cmme.checkOrCreateRecordingChan() {
+	// wait for a reset to prevent a race between start and stop calls
+	cmme.runState.WaitForReset()
+
+	// Check to see if we're already recording, and atomically advance the run state to start if we're not
+	if !cmme.runState.Start() {
 		log.Errorf("Attempted to start cost model metric recording when it's already running.")
 		return false
 	}
@@ -699,11 +675,8 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 
 			select {
 			case <-time.After(time.Minute):
-			case <-cmme.recordingStop:
-				cmme.recordingLock.Lock()
-				cmme.recordingStopping = false
-				cmme.recordingStop = nil
-				cmme.recordingLock.Unlock()
+			case <-cmme.runState.OnStop():
+				cmme.runState.Reset()
 				return
 			}
 		}
@@ -715,11 +688,5 @@ func (cmme *CostModelMetricsEmitter) Start() bool {
 // Stop halts the metrics emission loop after the current emission is completed
 // or if the emission is paused.
 func (cmme *CostModelMetricsEmitter) Stop() {
-	cmme.recordingLock.Lock()
-	defer cmme.recordingLock.Unlock()
-
-	if !cmme.recordingStopping && cmme.recordingStop != nil {
-		cmme.recordingStopping = true
-		close(cmme.recordingStop)
-	}
+	cmme.runState.Stop()
 }

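The new runState follows the same lifecycle as the config file watcher above: WaitForReset serializes Start against an in-flight Stop, the emission loop selects on OnStop, and Reset re-arms the state before the goroutine returns. A condensed sketch, assuming the pkg/util/atomic API exercised in this diff and the surrounding package's imports:

var run atomic.AtomicRunState

func startEmitting() bool {
	run.WaitForReset() // blocks only while a previous Stop is draining
	if !run.Start() {  // another caller won the race to start
		return false
	}
	go func() {
		for {
			select {
			case <-time.After(time.Minute):
				// ... record and emit metrics ...
			case <-run.OnStop():
				run.Reset() // re-arm so a future Start can succeed
				return
			}
		}
	}()
	return true
}

func stopEmitting() { run.Stop() }
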
+ 163 - 123
pkg/costmodel/router.go

@@ -7,13 +7,13 @@ import (
 	"fmt"
 	"io/ioutil"
 	"net/http"
-	"os"
 	"reflect"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/services"
 	"github.com/kubecost/cost-model/pkg/util/httputil"
 	"github.com/kubecost/cost-model/pkg/util/timeutil"
@@ -23,8 +23,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/klog"
 
-	k8serrors "k8s.io/apimachinery/pkg/api/errors"
-
 	"github.com/julienschmidt/httprouter"
 
 	sentry "github.com/getsentry/sentry-go"
@@ -41,6 +39,7 @@ import (
 	"github.com/kubecost/cost-model/pkg/util/json"
 	prometheus "github.com/prometheus/client_golang/api"
 	prometheusAPI "github.com/prometheus/client_golang/api/prometheus/v1"
+	appsv1 "k8s.io/api/apps/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	"github.com/patrickmn/go-cache"
@@ -72,20 +71,23 @@ var (
 // Accesses defines a singleton application instance, providing access to
 // Prometheus, Kubernetes, the cloud provider, and caches.
 type Accesses struct {
-	Router            *httprouter.Router
-	PrometheusClient  prometheus.Client
-	ThanosClient      prometheus.Client
-	KubeClientSet     kubernetes.Interface
-	ClusterMap        clusters.ClusterMap
-	CloudProvider     cloud.Provider
-	Model             *CostModel
-	MetricsEmitter    *CostModelMetricsEmitter
-	OutOfClusterCache *cache.Cache
-	AggregateCache    *cache.Cache
-	CostDataCache     *cache.Cache
-	ClusterCostsCache *cache.Cache
-	CacheExpiration   map[time.Duration]time.Duration
-	AggAPI            Aggregator
+	Router              *httprouter.Router
+	PrometheusClient    prometheus.Client
+	ThanosClient        prometheus.Client
+	KubeClientSet       kubernetes.Interface
+	ClusterCache        clustercache.ClusterCache
+	ClusterMap          clusters.ClusterMap
+	CloudProvider       cloud.Provider
+	ConfigFileManager   *config.ConfigFileManager
+	ClusterInfoProvider clusters.ClusterInfoProvider
+	Model               *CostModel
+	MetricsEmitter      *CostModelMetricsEmitter
+	OutOfClusterCache   *cache.Cache
+	AggregateCache      *cache.Cache
+	CostDataCache       *cache.Cache
+	ClusterCostsCache   *cache.Cache
+	CacheExpiration     map[time.Duration]time.Duration
+	AggAPI              Aggregator
 	// SettingsCache stores current state of app settings
 	SettingsCache *cache.Cache
 	// settingsSubscribers tracks channels through which changes to different
@@ -352,6 +354,14 @@ func WrapDataWithMessageAndWarning(data interface{}, err error, message, warning
 	return resp
 }
 
+// wrapAsObjectItems wraps a slice of items into an object containing a single "items" list,
+// allowing our k8s proxy methods to emulate a List() response from the k8s API
+func wrapAsObjectItems(items interface{}) map[string]interface{} {
+	return map[string]interface{}{
+		"items": items,
+	}
+}
+
 // RefreshPricingData needs to be called when a new node joins the fleet, since we cache the relevant subsets of pricing data to avoid storing the whole thing.
 func (a *Accesses) RefreshPricingData(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
@@ -694,7 +704,7 @@ func (a *Accesses) ClusterInfo(w http.ResponseWriter, r *http.Request, ps httpro
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	data := GetClusterInfo(a.KubeClientSet, a.CloudProvider)
+	data := a.ClusterInfoProvider.GetClusterInfo()
 
 	w.Write(WrapData(data, nil))
 }
@@ -928,12 +938,9 @@ func (a *Accesses) GetAllPersistentVolumes(w http.ResponseWriter, r *http.Reques
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	pvList, err := a.KubeClientSet.CoreV1().PersistentVolumes().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting persistent volume %v\n", err)
-	}
+	pvList := a.ClusterCache.GetAllPersistentVolumes()
 
-	body, err := json.Marshal(pvList)
+	body, err := json.Marshal(wrapAsObjectItems(pvList))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding persistent volumes: "+err.Error())
 	} else {
@@ -945,12 +952,28 @@ func (a *Accesses) GetAllPersistentVolumes(w http.ResponseWriter, r *http.Reques
 func (a *Accesses) GetAllDeployments(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
-	namespace := r.URL.Query().Get("namespace")
-	deploymentsList, err := a.KubeClientSet.AppsV1().Deployments(namespace).List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting deployments %v\n", err)
+
+	qp := httputil.NewQueryParams(r.URL.Query())
+
+	namespace := qp.Get("namespace", "")
+
+	deploymentsList := a.ClusterCache.GetAllDeployments()
+
+	// filter for provided namespace
+	var deployments []*appsv1.Deployment
+	if namespace == "" {
+		deployments = deploymentsList
+	} else {
+		deployments = []*appsv1.Deployment{}
+
+		for _, d := range deploymentsList {
+			if d.Namespace == namespace {
+				deployments = append(deployments, d)
+			}
+		}
 	}
-	body, err := json.Marshal(deploymentsList)
+
+	body, err := json.Marshal(wrapAsObjectItems(deployments))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding deployment: "+err.Error())
 	} else {
@@ -962,11 +985,9 @@ func (a *Accesses) GetAllStorageClasses(w http.ResponseWriter, r *http.Request,
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	scList, err := a.KubeClientSet.StorageV1().StorageClasses().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting storageclasses: "+err.Error())
-	}
-	body, err := json.Marshal(scList)
+	scList := a.ClusterCache.GetAllStorageClasses()
+
+	body, err := json.Marshal(wrapAsObjectItems(scList))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding storageclasses: "+err.Error())
 	} else {
@@ -977,12 +998,28 @@ func (a *Accesses) GetAllStorageClasses(w http.ResponseWriter, r *http.Request,
 func (a *Accesses) GetAllStatefulSets(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
-	namespace := r.URL.Query().Get("namespace")
-	deploymentsList, err := a.KubeClientSet.AppsV1().StatefulSets(namespace).List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting deployments %v\n", err)
+
+	qp := httputil.NewQueryParams(r.URL.Query())
+
+	namespace := qp.Get("namespace", "")
+
+	statefulSetsList := a.ClusterCache.GetAllStatefulSets()
+
+	// filter for provided namespace
+	var statefulSets []*appsv1.StatefulSet
+	if namespace == "" {
+		statefulSets = statefulSetsList
+	} else {
+		statefulSets = []*appsv1.StatefulSet{}
+
+		for _, ss := range statefulSetsList {
+			if ss.Namespace == namespace {
+				statefulSets = append(statefulSets, ss)
+			}
+		}
 	}
-	body, err := json.Marshal(deploymentsList)
+
+	body, err := json.Marshal(wrapAsObjectItems(statefulSets))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding deployment: "+err.Error())
 	} else {
@@ -994,12 +1031,9 @@ func (a *Accesses) GetAllNodes(w http.ResponseWriter, r *http.Request, ps httpro
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	nodeList, err := a.KubeClientSet.CoreV1().Nodes().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting node %v\n", err)
-	}
+	nodeList := a.ClusterCache.GetAllNodes()
 
-	body, err := json.Marshal(nodeList)
+	body, err := json.Marshal(wrapAsObjectItems(nodeList))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding nodes: "+err.Error())
 	} else {
@@ -1011,12 +1045,9 @@ func (a *Accesses) GetAllPods(w http.ResponseWriter, r *http.Request, ps httprou
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	podlist, err := a.KubeClientSet.CoreV1().Pods("").List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting pod %v\n", err)
-	}
+	podlist := a.ClusterCache.GetAllPods()
 
-	body, err := json.Marshal(podlist)
+	body, err := json.Marshal(wrapAsObjectItems(podlist))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding pods: "+err.Error())
 	} else {
@@ -1028,11 +1059,9 @@ func (a *Accesses) GetAllNamespaces(w http.ResponseWriter, r *http.Request, ps h
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	namespaces, err := a.KubeClientSet.CoreV1().Namespaces().List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting namespaces %v\n", err)
-	}
-	body, err := json.Marshal(namespaces)
+	namespaces := a.ClusterCache.GetAllNamespaces()
+
+	body, err := json.Marshal(wrapAsObjectItems(namespaces))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding deployment: "+err.Error())
 	} else {
@@ -1043,11 +1072,10 @@ func (a *Accesses) GetAllNamespaces(w http.ResponseWriter, r *http.Request, ps h
 func (a *Accesses) GetAllDaemonSets(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
-	daemonSets, err := a.KubeClientSet.AppsV1().DaemonSets("").List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting daemon sets %v\n", err)
-	}
-	body, err := json.Marshal(daemonSets)
+
+	daemonSets := a.ClusterCache.GetAllDaemonSets()
+
+	body, err := json.Marshal(wrapAsObjectItems(daemonSets))
 	if err != nil {
 		fmt.Fprintf(w, "Error decoding daemon set: "+err.Error())
 	} else {
@@ -1062,24 +1090,21 @@ func (a *Accesses) GetPod(w http.ResponseWriter, r *http.Request, ps httprouter.
 	podName := ps.ByName("name")
 	podNamespace := ps.ByName("namespace")
 
-	// Examples for error handling:
-	// - Use helper functions like e.g. errors.IsNotFound()
-	// - And/or cast to StatusError and use its properties like e.g. ErrStatus.Message
-	pod, err := a.KubeClientSet.CoreV1().Pods(podNamespace).Get(r.Context(), podName, metav1.GetOptions{})
-	if k8serrors.IsNotFound(err) {
-		fmt.Fprintf(w, "Pod not found\n")
-	} else if statusError, isStatus := err.(*k8serrors.StatusError); isStatus {
-		fmt.Fprintf(w, "Error getting pod %v\n", statusError.ErrStatus.Message)
-	} else if err != nil {
-		fmt.Fprintf(w, "Error getting pod: "+err.Error())
-	} else {
-		body, err := json.Marshal(pod)
-		if err != nil {
-			fmt.Fprintf(w, "Error decoding pod: "+err.Error())
-		} else {
-			w.Write(body)
+	// TODO: ClusterCache API could probably afford to have some better filtering
+	allPods := a.ClusterCache.GetAllPods()
+	for _, pod := range allPods {
+		if pod.Namespace == podNamespace && pod.Name == podName {
+			body, err := json.Marshal(pod)
+			if err != nil {
+				fmt.Fprintf(w, "Error decoding pod: "+err.Error())
+			} else {
+				w.Write(body)
+			}
+			return
 		}
 	}
+
+	fmt.Fprintf(w, "Pod not found\n")
 }
 
 func (a *Accesses) PrometheusRecordingRules(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
@@ -1105,9 +1130,9 @@ func (a *Accesses) PrometheusConfig(w http.ResponseWriter, r *http.Request, _ ht
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	pConfig := make(map[string]string)
-
-	pConfig["address"] = os.Getenv("PROMETHEUS_SERVER_ENDPOINT")
+	pConfig := map[string]string{
+		"address": env.GetPrometheusServerEndpoint(),
+	}
 
 	body, err := json.Marshal(pConfig)
 	if err != nil {
@@ -1140,13 +1165,10 @@ func (a *Accesses) GetOrphanedPods(w http.ResponseWriter, r *http.Request, ps ht
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	podlist, err := a.KubeClientSet.CoreV1().Pods("").List(r.Context(), metav1.ListOptions{})
-	if err != nil {
-		fmt.Fprintf(w, "Error getting pod %v\n", err)
-	}
+	podlist := a.ClusterCache.GetAllPods()
 
-	var lonePods []v1.Pod
-	for _, pod := range podlist.Items {
+	var lonePods []*v1.Pod
+	for _, pod := range podlist {
 		if len(pod.OwnerReferences) == 0 {
 			lonePods = append(lonePods, pod)
 		}
@@ -1164,10 +1186,11 @@ func (a *Accesses) GetInstallNamespace(w http.ResponseWriter, r *http.Request, _
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	ns := os.Getenv("KUBECOST_NAMESPACE")
+	ns := env.GetKubecostNamespace()
 	w.Write([]byte(ns))
 }
 
+// logsFor pulls the logs for a specific pod, namespace, and container
 func logsFor(c kubernetes.Interface, namespace string, pod string, container string, dur time.Duration, ctx context.Context) (string, error) {
 	since := time.Now().UTC().Add(-dur)
 
@@ -1196,18 +1219,13 @@ func (a *Accesses) GetPodLogs(w http.ResponseWriter, r *http.Request, ps httprou
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	q := r.URL.Query()
-	ns := q.Get("namespace")
-	if ns == "" {
-		ns = os.Getenv("KUBECOST_NAMESPACE")
-	}
-	pod := q.Get("pod")
-	selector := q.Get("selector")
-	container := q.Get("container")
-	since := q.Get("since")
-	if since == "" {
-		since = "24h"
-	}
+	qp := httputil.NewQueryParams(r.URL.Query())
+
+	ns := qp.Get("namespace", env.GetKubecostNamespace())
+	pod := qp.Get("pod", "")
+	selector := qp.Get("selector", "")
+	container := qp.Get("container", "")
+	since := qp.Get("since", "24h")
 
 	sinceDuration, err := time.ParseDuration(since)
 	if err != nil {
@@ -1284,8 +1302,6 @@ func (a *Accesses) AddServiceKey(w http.ResponseWriter, r *http.Request, ps http
 
 	r.ParseForm()
 
-	//p.CloudProvider.AddServiceKey(r.PostForm)
-
 	key := r.PostForm.Get("key")
 	k := []byte(key)
 	err := ioutil.WriteFile("/var/configs/key.json", k, 0644)
@@ -1300,7 +1316,7 @@ func (a *Accesses) GetHelmValues(w http.ResponseWriter, r *http.Request, ps http
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
-	encodedValues := os.Getenv("HELM_VALUES")
+	encodedValues := env.Get("HELM_VALUES", "")
 	if encodedValues == "" {
 		fmt.Fprintf(w, "Values reporting disabled")
 		return
@@ -1319,14 +1335,15 @@ func (a *Accesses) Status(w http.ResponseWriter, r *http.Request, _ httprouter.P
 	w.Header().Set("Content-Type", "application/json")
 	w.Header().Set("Access-Control-Allow-Origin", "*")
 
+	promServer := env.GetPrometheusServerEndpoint()
+
 	api := prometheusAPI.NewAPI(a.PrometheusClient)
 	result, err := api.Config(r.Context())
 	if err != nil {
-
-		fmt.Fprintf(w, "Using Prometheus at "+os.Getenv("PROMETHEUS_SERVER_ENDPOINT")+". Error: "+err.Error())
+		fmt.Fprintf(w, "Using Prometheus at "+promServer+". Error: "+err.Error())
 	} else {
 
-		fmt.Fprintf(w, "Using Prometheus at "+os.Getenv("PROMETHEUS_SERVER_ENDPOINT")+". PrometheusConfig: "+result.YAML)
+		fmt.Fprintf(w, "Using Prometheus at "+promServer+". PrometheusConfig: "+result.YAML)
 	}
 }
 
@@ -1441,12 +1458,24 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		panic(err.Error())
 	}
 
+	// Create ConfigFileManager for synchronization of shared configuration
+	confManager := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		BucketStoreConfig: env.GetKubecostConfigBucket(),
+		LocalConfigPath:   "/",
+	})
+
 	// Create Kubernetes Cluster Cache + Watchers
-	k8sCache := clustercache.NewKubernetesClusterCache(kubeClientset)
+	var k8sCache clustercache.ClusterCache
+	if env.IsClusterCacheFileEnabled() {
+		importLocation := confManager.ConfigFileAt("/var/configs/cluster-cache.json")
+		k8sCache = clustercache.NewClusterImporter(importLocation)
+	} else {
+		k8sCache = clustercache.NewKubernetesClusterCache(kubeClientset)
+	}
 	k8sCache.Run()
 
 	cloudProviderKey := env.GetCloudProviderAPIKey()
-	cloudProvider, err := cloud.NewProvider(k8sCache, cloudProviderKey)
+	cloudProvider, err := cloud.NewProvider(k8sCache, cloudProviderKey, confManager)
 	if err != nil {
 		panic(err.Error())
 	}
@@ -1506,13 +1535,21 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		}
 	}
 
+	// ClusterInfo Provider to provide the cluster map with local and remote cluster data
+	var clusterInfoProvider clusters.ClusterInfoProvider
+	if env.IsClusterInfoFileEnabled() {
+		clusterInfoFile := confManager.ConfigFileAt("/var/configs/cluster-info.json")
+		clusterInfoProvider = NewConfiguredClusterInfoProvider(clusterInfoFile)
+	} else {
+		clusterInfoProvider = NewLocalClusterInfoProvider(kubeClientset, cloudProvider)
+	}
+
 	// Initialize ClusterMap for maintaining ClusterInfo by ClusterID
 	var clusterMap clusters.ClusterMap
-	localCIProvider := NewLocalClusterInfoProvider(kubeClientset, cloudProvider)
 	if thanosClient != nil {
-		clusterMap = clusters.NewClusterMap(thanosClient, localCIProvider, 10*time.Minute)
+		clusterMap = clusters.NewClusterMap(thanosClient, clusterInfoProvider, 10*time.Minute)
 	} else {
-		clusterMap = clusters.NewClusterMap(promCli, localCIProvider, 5*time.Minute)
+		clusterMap = clusters.NewClusterMap(promCli, clusterInfoProvider, 5*time.Minute)
 	}
 
 	// cache responses from model and aggregation for a default of 10 minutes;
@@ -1541,24 +1578,27 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		pc = promCli
 	}
 	costModel := NewCostModel(pc, cloudProvider, k8sCache, clusterMap, scrapeInterval)
-	metricsEmitter := NewCostModelMetricsEmitter(promCli, k8sCache, cloudProvider, costModel)
+	metricsEmitter := NewCostModelMetricsEmitter(promCli, k8sCache, cloudProvider, clusterInfoProvider, costModel)
 
 	a := &Accesses{
-		Router:            httprouter.New(),
-		PrometheusClient:  promCli,
-		ThanosClient:      thanosClient,
-		KubeClientSet:     kubeClientset,
-		ClusterMap:        clusterMap,
-		CloudProvider:     cloudProvider,
-		Model:             costModel,
-		MetricsEmitter:    metricsEmitter,
-		AggregateCache:    aggregateCache,
-		CostDataCache:     costDataCache,
-		ClusterCostsCache: clusterCostsCache,
-		OutOfClusterCache: outOfClusterCache,
-		SettingsCache:     settingsCache,
-		CacheExpiration:   cacheExpiration,
-		httpServices:      services.NewCostModelServices(),
+		Router:              httprouter.New(),
+		PrometheusClient:    promCli,
+		ThanosClient:        thanosClient,
+		KubeClientSet:       kubeClientset,
+		ClusterCache:        k8sCache,
+		ClusterMap:          clusterMap,
+		CloudProvider:       cloudProvider,
+		ConfigFileManager:   confManager,
+		ClusterInfoProvider: clusterInfoProvider,
+		Model:               costModel,
+		MetricsEmitter:      metricsEmitter,
+		AggregateCache:      aggregateCache,
+		CostDataCache:       costDataCache,
+		ClusterCostsCache:   clusterCostsCache,
+		OutOfClusterCache:   outOfClusterCache,
+		SettingsCache:       settingsCache,
+		CacheExpiration:     cacheExpiration,
+		httpServices:        services.NewCostModelServices(),
 	}
 	// Use the Accesses instance, itself, as the CostModelAggregator. This is
 	// confusing and unconventional, but necessary so that we can swap it

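Inside a handler such as GetAllNodes, the observable change for API consumers is only the envelope: the cache returns plain slices, and wrapAsObjectItems restores the List-style shape. Roughly (an illustrative sketch, not verbatim from the diff):

nodes := a.ClusterCache.GetAllNodes() // []*v1.Node served from the watch cache
body, err := json.Marshal(wrapAsObjectItems(nodes))
if err == nil {
	w.Write(body) // => {"items":[{ ...node... }, { ...node... }]}
}
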
+ 44 - 0
pkg/env/costmodelenv.go

@@ -6,6 +6,7 @@ import (
 	"time"
 
 	"github.com/kubecost/cost-model/pkg/log"
+	"github.com/kubecost/cost-model/pkg/util/timeutil"
 )
 
 const (
@@ -75,8 +76,51 @@ const (
 
 	PricingConfigmapName  = "PRICING_CONFIGMAP_NAME"
 	KubecostJobNameEnvVar = "KUBECOST_JOB_NAME"
+
+	KubecostConfigBucketEnvVar    = "KUBECOST_CONFIG_BUCKET"
+	ClusterInfoFileEnabledEnvVar  = "CLUSTER_INFO_FILE_ENABLED"
+	ClusterCacheFileEnabledEnvVar = "CLUSTER_CACHE_FILE_ENABLED"
+	PrometheusQueryOffsetEnvVar   = "PROMETHEUS_QUERY_OFFSET"
 )
 
+// GetKubecostConfigBucket returns a file location for a mounted bucket configuration which is used to store
+// a subset of kubecost configurations that require sharing via remote storage.
+func GetKubecostConfigBucket() string {
+	return Get(KubecostConfigBucketEnvVar, "")
+}
+
+// IsClusterInfoFileEnabled returns true if the cluster info should be read from a file rather
+// than pulled from the local cloud provider and kubernetes.
+func IsClusterInfoFileEnabled() bool {
+	return GetBool(ClusterInfoFileEnabledEnvVar, false)
+}
+
+// IsClusterCacheFileEnabled returns true if the kubernetes cluster data should be read from a file
+// rather than pulled from the local kubernetes API.
+func IsClusterCacheFileEnabled() bool {
+	return GetBool(ClusterCacheFileEnabledEnvVar, false)
+}
+
+// GetPrometheusQueryOffset returns the time.Duration to offset all prometheus queries by. NOTE: This env var is applied
+// to all non-range queries made via our query context. It should only be set when there is a significant delay in
+// data arriving in the target prom db. For example, when supplying a thanos or cortex querier as the prometheus server,
+// a 3h offset will evaluate each query at now - 3h rather than now.
+//
+// This offset is NOT the same as the GetThanosOffset() option, as that is only applied to queries made specifically targeting
+// thanos. This offset is applied globally.
+func GetPrometheusQueryOffset() time.Duration {
+	offset := Get(PrometheusQueryOffsetEnvVar, "")
+	if offset == "" {
+		return 0
+	}
+
+	dur, err := timeutil.ParseDuration(offset)
+	if err != nil {
+		return 0
+	}
+	return dur
+}
+
 func GetPricingConfigmapName() string {
 	return Get(PricingConfigmapName, "pricing-configs")
 }

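A small sketch of the offset parsing behavior (values illustrative; timeutil.ParseDuration is assumed to accept standard Go duration strings):

// PROMETHEUS_QUERY_OFFSET=3h  -> 3h0m0s
// PROMETHEUS_QUERY_OFFSET=bad -> 0s (parse errors fall back to no offset)
// unset                       -> 0s
offset := env.GetPrometheusQueryOffset()
if offset > 0 {
	log.Infof("Applying prometheus query offset: %s", offset)
}
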
+ 7 - 0
pkg/env/kubemetricsenv.go

@@ -3,6 +3,7 @@ package env
 const (
 	KubecostMetricsPodEnabledEnvVar = "KUBECOST_METRICS_POD_ENABLED"
 	KubecostMetricsPodPortEnvVar    = "KUBECOST_METRICS_PORT"
+	ExportClusterCacheEnabledEnvVar = "EXPORT_CLUSTER_CACHE_ENABLED"
 )
 
 func GetKubecostMetricsPort() int {
@@ -13,3 +14,9 @@ func GetKubecostMetricsPort() int {
 func IsKubecostMetricsPodEnabled() bool {
 	return GetBool(KubecostMetricsPodEnabledEnvVar, false)
 }
+
+// IsExportClusterCacheEnabled returns true if the metrics pod should export the cluster cache
+// data to a target file location
+func IsExportClusterCacheEnabled() bool {
+	return GetBool(ExportClusterCacheEnabledEnvVar, false)
+}

+ 2 - 0
pkg/kubecost/window.go

@@ -8,6 +8,8 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/util/timeutil"
+
 	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/thanos"
 	"github.com/kubecost/cost-model/pkg/util/timeutil"

+ 15 - 0
pkg/prom/query.go

@@ -8,6 +8,7 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/errors"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/httputil"
@@ -21,6 +22,10 @@ const (
 	epQueryRange = apiPrefix + "/query_range"
 )
 
+// prometheus query offset to apply to each non-range query,
+// package-scoped to avoid re-parsing the duration on each use
+var promQueryOffset time.Duration = env.GetPrometheusQueryOffset()
+
 // Context wraps a Prometheus client and provides methods for querying and
 // parsing query responses and errors.
 type Context struct {
@@ -171,6 +176,16 @@ func (ctx *Context) RawQuery(query string) ([]byte, error) {
 	u := ctx.Client.URL(epQuery, nil)
 	q := u.Query()
 	q.Set("query", query)
+
+	// for non-range queries, we set the query's evaluation timestamp to now - offset.
+	// this is a special case that's typically only used when the primary
+	// prom db has delayed insertion (thanos, cortex, etc...)
+	if promQueryOffset != 0 && ctx.name != AllocationContextName {
+		q.Set("time", time.Now().Add(-promQueryOffset).UTC().Format(time.RFC3339))
+	} else {
+		q.Set("time", time.Now().UTC().Format(time.RFC3339))
+	}
+
 	u.RawQuery = q.Encode()
 
 	req, err := http.NewRequest(http.MethodPost, u.String(), nil)

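A worked example of the evaluation timestamp set above, with illustrative values (3h offset, current time 2021-06-01T12:00:00Z):

now := time.Date(2021, 6, 1, 12, 0, 0, 0, time.UTC)
ts := now.Add(-3 * time.Hour).Format(time.RFC3339)
// ts == "2021-06-01T09:00:00Z"; sent as the "time" query parameter, so the
// query is evaluated 3h in the past to tolerate delayed ingestion.
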
+ 58 - 0
pkg/storage/bucketstorage.go

@@ -0,0 +1,58 @@
+package storage
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/pkg/errors"
+	"gopkg.in/yaml.v2"
+)
+
+// StorageProvider is the type of provider used for storage if not leveraging a file implementation.
+type StorageProvider string
+
+const (
+	S3 StorageProvider = "S3"
+	// AZURE StorageProvider = "AZURE"
+	// GCS   StorageProvider = "GCS"
+)
+
+// StorageConfig is the configuration type used as the "parent" configuration. It contains a type, which will
+// specify the bucket storage implementation, and a configuration object specific to that storage implementation.
+type StorageConfig struct {
+	Type   StorageProvider `yaml:"type"`
+	Config interface{}     `yaml:"config"`
+}
+
+// NewBucketStorage initializes and returns a new Storage implementation backed by the provided
+// storage provider configuration. The configuration uses the layout provided in thanos: https://thanos.io/tip/thanos/storage.md/
+func NewBucketStorage(config []byte) (Storage, error) {
+	storageConfig := &StorageConfig{}
+	if err := yaml.UnmarshalStrict(config, storageConfig); err != nil {
+		return nil, errors.Wrap(err, "parsing config YAML file")
+	}
+
+	// Because the Config property is specific to the storage implementation, we'll marshal back into yaml, and allow
+	// the specific implementation to unmarshal back into a concrete configuration type.
+	config, err := yaml.Marshal(storageConfig.Config)
+	if err != nil {
+		return nil, errors.Wrap(err, "marshal content of storage configuration")
+	}
+
+	var storage Storage
+	switch strings.ToUpper(string(storageConfig.Type)) {
+	case string(S3):
+		storage, err = NewS3Storage(config)
+	//case string(GCS):
+	//	storage, err = NewGCSStorage(config)
+	//case string(AZURE):
+	//	storage, err = NewAzureStorage(config)
+	default:
+		return nil, errors.Errorf("storage with type %s is not supported", storageConfig.Type)
+	}
+	if err != nil {
+		return nil, errors.Wrap(err, fmt.Sprintf("create %s client", storageConfig.Type))
+	}
+
+	return storage, nil
+}

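A minimal configuration in the thanos layout accepted by NewBucketStorage might look like the following sketch (bucket name, endpoint, and credentials are placeholders; S3 is the only type currently wired up):

conf := []byte(`
type: S3
config:
  bucket: kubecost-shared-configs
  endpoint: s3.us-east-1.amazonaws.com
  region: us-east-1
  access_key: "<ACCESS_KEY>"
  secret_key: "<SECRET_KEY>"
`)

store, err := storage.NewBucketStorage(conf)
if err != nil {
	panic(err)
}
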
+ 139 - 0
pkg/storage/filestorage.go

@@ -0,0 +1,139 @@
+package storage
+
+import (
+	gofs "io/fs"
+	"io/ioutil"
+	"os"
+	gopath "path"
+	"path/filepath"
+
+	"github.com/kubecost/cost-model/pkg/util/fileutil"
+	"github.com/pkg/errors"
+)
+
+// FileStorage leverages the file system to write data to disk.
+type FileStorage struct {
+	baseDir string
+}
+
+// NewFileStorage returns a new storage API which leverages the file system.
+func NewFileStorage(baseDir string) Storage {
+	return &FileStorage{baseDir}
+}
+
+// FullPath returns the storage working path combined with the path provided
+func (fs *FileStorage) FullPath(path string) string {
+	return gopath.Join(fs.baseDir, path)
+}
+
+// Stat returns the StorageInfo for the specific path.
+func (fs *FileStorage) Stat(path string) (*StorageInfo, error) {
+	f := gopath.Join(fs.baseDir, path)
+	st, err := os.Stat(f)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, DoesNotExistError
+		}
+
+		return nil, errors.Wrap(err, "Failed to stat file")
+	}
+
+	return FileToStorageInfo(st), nil
+}
+
+// List uses the relative path of the storage combined with the provided path to return
+// storage information for the files.
+func (fs *FileStorage) List(path string) ([]*StorageInfo, error) {
+	p := gopath.Join(fs.baseDir, path)
+
+	// Read the directory entries at the provided path
+	files, err := ioutil.ReadDir(p)
+	if err != nil {
+		return nil, err
+	}
+
+	return FilesToStorageInfo(files), nil
+}
+
+// Read uses the relative path of the storage combined with the provided path to
+// read the contents.
+func (fs *FileStorage) Read(path string) ([]byte, error) {
+	f := gopath.Join(fs.baseDir, path)
+
+	b, err := ioutil.ReadFile(f)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, DoesNotExistError
+		}
+		return nil, errors.Wrap(err, "Failed to read file")
+	}
+
+	return b, nil
+}
+
+// Write uses the relative path of the storage combined with the provided path
+// to write a new file or overwrite an existing file.
+func (fs *FileStorage) Write(path string, data []byte) error {
+	f := fs.prepare(path)
+
+	err := ioutil.WriteFile(f, data, os.ModePerm)
+	if err != nil {
+		return errors.Wrap(err, "Failed to write file")
+	}
+
+	return nil
+}
+
+// Remove uses the relative path of the storage combined with the provided path to
+// remove a file from storage permanently.
+func (fs *FileStorage) Remove(path string) error {
+	f := gopath.Join(fs.baseDir, path)
+
+	err := os.Remove(f)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return DoesNotExistError
+		}
+
+		return errors.Wrap(err, "Failed to remove file")
+	}
+
+	return nil
+}
+
+// Exists uses the relative path of the storage combined with the provided path to
+// determine if the file exists.
+func (fs *FileStorage) Exists(path string) (bool, error) {
+	f := gopath.Join(fs.baseDir, path)
+	return fileutil.FileExists(f)
+}
+
+// prepare creates the directory being written to if it does not already exist, and
+// then returns the full path for the file to be written.
+func (fs *FileStorage) prepare(path string) string {
+	f := gopath.Join(fs.baseDir, path)
+	dir := filepath.Dir(f)
+	if _, e := os.Stat(dir); e != nil && os.IsNotExist(e) {
+		os.MkdirAll(dir, os.ModePerm)
+	}
+
+	return f
+}
+
+// FilesToStorageInfo maps a []fs.FileInfo to []*storage.StorageInfo
+func FilesToStorageInfo(fileInfo []gofs.FileInfo) []*StorageInfo {
+	var stats []*StorageInfo
+	for _, info := range fileInfo {
+		stats = append(stats, FileToStorageInfo(info))
+	}
+	return stats
+}
+
+// FileToStorageInfo maps a fs.FileInfo to *storage.StorageInfo
+func FileToStorageInfo(fileInfo gofs.FileInfo) *StorageInfo {
+	return &StorageInfo{
+		Name:    fileInfo.Name(),
+		Size:    fileInfo.Size(),
+		ModTime: fileInfo.ModTime(),
+	}
+}

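Usage mirrors the Storage contract; a short sketch (base directory and file name are illustrative):

fs := storage.NewFileStorage("/var/configs")

data := []byte(`{"nodes":[]}`)
if err := fs.Write("cluster-cache.json", data); err != nil {
	log.Errorf("write failed: %s", err)
}

info, err := fs.Stat("cluster-cache.json")
if err == nil {
	log.Infof("%s: %d bytes, modified %s", info.Name, info.Size, info.ModTime)
}
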
+ 543 - 0
pkg/storage/s3storage.go

@@ -0,0 +1,543 @@
+// Fork from Thanos S3 Bucket support to reuse configuration options
+// Licensed under the Apache License 2.0
+// https://github.com/thanos-io/thanos/blob/main/pkg/objstore/s3/s3.go
+package storage
+
+import (
+	"bytes"
+	"context"
+	"crypto/tls"
+	"io/ioutil"
+	"net"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/log"
+
+	"github.com/minio/minio-go/v7"
+	"github.com/minio/minio-go/v7/pkg/credentials"
+	"github.com/minio/minio-go/v7/pkg/encrypt"
+	"github.com/pkg/errors"
+
+	"gopkg.in/yaml.v2"
+)
+
+type ctxKey int
+
+const (
+	// DirDelim is the delimiter used to model a directory structure in an object store bucket.
+	DirDelim = "/"
+
+	// SSEKMS is the name of the SSE-KMS method for objectstore encryption.
+	SSEKMS = "SSE-KMS"
+
+	// SSEC is the name of the SSE-C method for objstore encryption.
+	SSEC = "SSE-C"
+
+	// SSES3 is the name of the SSE-S3 method for objstore encryption.
+	SSES3 = "SSE-S3"
+
+	// sseConfigKey is the context key to override SSE config. This feature is used by downstream
+	// projects (eg. Cortex) to inject custom SSE config on a per-request basis. Future work or
+	// refactoring can introduce breaking changes as far as the functionality is preserved.
+	// NOTE: we're using a context value only because it's a very specific S3 option. If SSE will
+	// be available to wider set of backends we should probably add a variadic option to Get() and Upload().
+	sseConfigKey = ctxKey(0)
+)
+
+var DefaultConfig = S3Config{
+	PutUserMetadata: map[string]string{},
+	HTTPConfig: HTTPConfig{
+		IdleConnTimeout:       time.Duration(90 * time.Second),
+		ResponseHeaderTimeout: time.Duration(2 * time.Minute),
+		TLSHandshakeTimeout:   time.Duration(10 * time.Second),
+		ExpectContinueTimeout: time.Duration(1 * time.Second),
+		MaxIdleConns:          100,
+		MaxIdleConnsPerHost:   100,
+		MaxConnsPerHost:       0,
+	},
+	PartSize: 1024 * 1024 * 64, // 64MB.
+}
+
+// Config stores the configuration for s3 bucket.
+type S3Config struct {
+	Bucket             string            `yaml:"bucket"`
+	Endpoint           string            `yaml:"endpoint"`
+	Region             string            `yaml:"region"`
+	AccessKey          string            `yaml:"access_key"`
+	Insecure           bool              `yaml:"insecure"`
+	SignatureV2        bool              `yaml:"signature_version2"`
+	SecretKey          string            `yaml:"secret_key"`
+	PutUserMetadata    map[string]string `yaml:"put_user_metadata"`
+	HTTPConfig         HTTPConfig        `yaml:"http_config"`
+	TraceConfig        TraceConfig       `yaml:"trace"`
+	ListObjectsVersion string            `yaml:"list_objects_version"`
+	// PartSize used for multipart upload. Only used if uploaded object size is known and larger than configured PartSize.
+	// NOTE we need to make sure this number does not produce more parts than 10 000.
+	PartSize  uint64    `yaml:"part_size"`
+	SSEConfig SSEConfig `yaml:"sse_config"`
+}
+
+// SSEConfig deals with the configuration of SSE for Minio. The following options are valid:
+// kmsencryptioncontext == https://docs.aws.amazon.com/kms/latest/developerguide/services-s3.html#s3-encryption-context
+type SSEConfig struct {
+	Type                 string            `yaml:"type"`
+	KMSKeyID             string            `yaml:"kms_key_id"`
+	KMSEncryptionContext map[string]string `yaml:"kms_encryption_context"`
+	EncryptionKey        string            `yaml:"encryption_key"`
+}
+
+type TraceConfig struct {
+	Enable bool `yaml:"enable"`
+}
+
+// HTTPConfig stores the http.Transport configuration for the s3 minio client.
+type HTTPConfig struct {
+	IdleConnTimeout       time.Duration `yaml:"idle_conn_timeout"`
+	ResponseHeaderTimeout time.Duration `yaml:"response_header_timeout"`
+	InsecureSkipVerify    bool          `yaml:"insecure_skip_verify"`
+
+	TLSHandshakeTimeout   time.Duration `yaml:"tls_handshake_timeout"`
+	ExpectContinueTimeout time.Duration `yaml:"expect_continue_timeout"`
+	MaxIdleConns          int           `yaml:"max_idle_conns"`
+	MaxIdleConnsPerHost   int           `yaml:"max_idle_conns_per_host"`
+	MaxConnsPerHost       int           `yaml:"max_conns_per_host"`
+
+	// Allow upstream callers to inject a round tripper
+	Transport http.RoundTripper `yaml:"-"`
+}
+
+// DefaultTransport - this default transport is based on the Minio
+// DefaultTransport up until the following commit:
+// https://github.com/minio/minio-go/commit/008c7aa71fc17e11bf980c209a4f8c4d687fc884
+// The values have since diverged.
+func DefaultTransport(config S3Config) *http.Transport {
+	return &http.Transport{
+		Proxy: http.ProxyFromEnvironment,
+		DialContext: (&net.Dialer{
+			Timeout:   30 * time.Second,
+			KeepAlive: 30 * time.Second,
+			DualStack: true,
+		}).DialContext,
+
+		MaxIdleConns:          config.HTTPConfig.MaxIdleConns,
+		MaxIdleConnsPerHost:   config.HTTPConfig.MaxIdleConnsPerHost,
+		IdleConnTimeout:       time.Duration(config.HTTPConfig.IdleConnTimeout),
+		MaxConnsPerHost:       config.HTTPConfig.MaxConnsPerHost,
+		TLSHandshakeTimeout:   time.Duration(config.HTTPConfig.TLSHandshakeTimeout),
+		ExpectContinueTimeout: time.Duration(config.HTTPConfig.ExpectContinueTimeout),
+		// A custom ResponseHeaderTimeout was introduced
+		// to cover cases where the tcp connection works but
+		// the server never answers. Defaults to 2 minutes.
+		ResponseHeaderTimeout: time.Duration(config.HTTPConfig.ResponseHeaderTimeout),
+		// Set this value so that the underlying transport round-tripper
+		// doesn't try to auto decode the body of objects with
+		// content-encoding set to `gzip`.
+		//
+		// Refer: https://golang.org/src/net/http/transport.go?h=roundTrip#L1843.
+		DisableCompression: true,
+		// #nosec It's up to the user to decide on TLS configs
+		TLSClientConfig: &tls.Config{InsecureSkipVerify: config.HTTPConfig.InsecureSkipVerify},
+	}
+}
+
+// S3Storage provides storage via S3
+type S3Storage struct {
+	name            string
+	client          *minio.Client
+	defaultSSE      encrypt.ServerSide
+	putUserMetadata map[string]string
+	partSize        uint64
+	listObjectsV1   bool
+}
+
+// parseConfig unmarshals a buffer into a Config with default HTTPConfig values.
+func parseConfig(conf []byte) (S3Config, error) {
+	config := DefaultConfig
+	if err := yaml.UnmarshalStrict(conf, &config); err != nil {
+		return S3Config{}, err
+	}
+
+	return config, nil
+}
+
+// NewS3Storage returns a new S3Storage using the provided s3 config values.
+func NewS3Storage(conf []byte) (*S3Storage, error) {
+	log.Infof("Creating new S3 Storage...")
+
+	config, err := parseConfig(conf)
+	if err != nil {
+		return nil, err
+	}
+
+	return NewS3StorageWith(config)
+}
+
+// NewS3StorageWith returns a new S3Storage using the provided S3Config.
+func NewS3StorageWith(config S3Config) (*S3Storage, error) {
+	var chain []credentials.Provider
+
+	log.Infof("New S3 Storage With Config: %+v", config)
+
+	wrapCredentialsProvider := func(p credentials.Provider) credentials.Provider { return p }
+	if config.SignatureV2 {
+		wrapCredentialsProvider = func(p credentials.Provider) credentials.Provider {
+			return &overrideSignerType{Provider: p, signerType: credentials.SignatureV2}
+		}
+	}
+
+	if err := validate(config); err != nil {
+		return nil, err
+	}
+	if config.AccessKey != "" {
+		chain = []credentials.Provider{wrapCredentialsProvider(&credentials.Static{
+			Value: credentials.Value{
+				AccessKeyID:     config.AccessKey,
+				SecretAccessKey: config.SecretKey,
+				SignerType:      credentials.SignatureV4,
+			},
+		})}
+	} else {
+		chain = []credentials.Provider{
+			wrapCredentialsProvider(&credentials.EnvAWS{}),
+			wrapCredentialsProvider(&credentials.FileAWSCredentials{}),
+			wrapCredentialsProvider(&credentials.IAM{
+				Client: &http.Client{
+					Transport: http.DefaultTransport,
+				},
+			}),
+		}
+	}
+
+	// Check if a round tripper has been set in the config;
+	// otherwise build the default transport.
+	var rt http.RoundTripper
+	if config.HTTPConfig.Transport != nil {
+		rt = config.HTTPConfig.Transport
+	} else {
+		rt = DefaultTransport(config)
+	}
+
+	client, err := minio.New(config.Endpoint, &minio.Options{
+		Creds:     credentials.NewChainCredentials(chain),
+		Secure:    !config.Insecure,
+		Region:    config.Region,
+		Transport: rt,
+	})
+	if err != nil {
+		return nil, errors.Wrap(err, "initialize s3 client")
+	}
+
+	var sse encrypt.ServerSide
+	if config.SSEConfig.Type != "" {
+		switch config.SSEConfig.Type {
+		case SSEKMS:
+			sse, err = encrypt.NewSSEKMS(config.SSEConfig.KMSKeyID, config.SSEConfig.KMSEncryptionContext)
+			if err != nil {
+				return nil, errors.Wrap(err, "initialize s3 client SSE-KMS")
+			}
+
+		case SSEC:
+			key, err := ioutil.ReadFile(config.SSEConfig.EncryptionKey)
+			if err != nil {
+				return nil, err
+			}
+
+			sse, err = encrypt.NewSSEC(key)
+			if err != nil {
+				return nil, errors.Wrap(err, "initialize s3 client SSE-C")
+			}
+
+		case SSES3:
+			sse = encrypt.NewSSE()
+
+		default:
+			sseErrMsg := errors.Errorf("unsupported type %q was provided; supported types are SSE-S3, SSE-KMS, SSE-C", config.SSEConfig.Type)
+			return nil, errors.Wrap(sseErrMsg, "initialize s3 client SSE config")
+		}
+	}
+
+	if config.ListObjectsVersion != "" && config.ListObjectsVersion != "v1" && config.ListObjectsVersion != "v2" {
+		return nil, errors.Errorf("Initialize s3 client list objects version: Unsupported version %q was provided. Supported values are v1, v2", config.ListObjectsVersion)
+	}
+
+	bkt := &S3Storage{
+		name:            config.Bucket,
+		client:          client,
+		defaultSSE:      sse,
+		putUserMetadata: config.PutUserMetadata,
+		partSize:        config.PartSize,
+		listObjectsV1:   config.ListObjectsVersion == "v1",
+	}
+	return bkt, nil
+}
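A minimal round-trip sketch against the typed constructor, setting only fields this file references (values are placeholders; SSE-S3 shown as the simplest encryption option):

cfg := DefaultConfig
cfg.Endpoint = "s3.us-east-1.amazonaws.com"
cfg.Bucket = "kubecost-example"
cfg.Region = "us-east-1"
cfg.AccessKey = "EXAMPLEKEY"
cfg.SecretKey = "EXAMPLESECRET"
cfg.SSEConfig.Type = SSES3 // server-side encryption with S3-managed keys

store, err := NewS3StorageWith(cfg)
if err != nil {
	// handle error
}

if err := store.Write("/cluster-cache.json", []byte(`{"nodes":[]}`)); err != nil {
	// handle error
}
data, err := store.Read("/cluster-cache.json") // data == []byte(`{"nodes":[]}`)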
+
+// Name returns the bucket name for s3.
+func (s3 *S3Storage) Name() string {
+	return s3.name
+}
+
+// validate checks that the required config options are set.
+func validate(conf S3Config) error {
+	if conf.Endpoint == "" {
+		return errors.New("no s3 endpoint in config file")
+	}
+
+	if conf.AccessKey == "" && conf.SecretKey != "" {
+		return errors.New("no s3 acccess_key specified while secret_key is present in config file; either both should be present in config or envvars/IAM should be used.")
+	}
+
+	if conf.AccessKey != "" && conf.SecretKey == "" {
+		return errors.New("no s3 secret_key specified while access_key is present in config file; either both should be present in config or envvars/IAM should be used.")
+	}
+
+	if conf.SSEConfig.Type == SSEC && conf.SSEConfig.EncryptionKey == "" {
+		return errors.New("encryption_key must be set if sse_config.type is set to 'SSE-C'")
+	}
+
+	if conf.SSEConfig.Type == SSEKMS && conf.SSEConfig.KMSKeyID == "" {
+		return errors.New("kms_key_id must be set if sse_config.type is set to 'SSE-KMS'")
+	}
+
+	return nil
+}
+
+// FullPath returns the bucket-relative object path for the provided name
+// (for s3, this is just the name with any leading slash trimmed).
+func (s3 *S3Storage) FullPath(name string) string {
+	name = s3.trimLeading(name)
+
+	return name
+}
+
+// Read returns the contents of the object with the given name.
+func (s3 *S3Storage) Read(name string) ([]byte, error) {
+	name = s3.trimLeading(name)
+
+	log.Infof("S3Storage::Read(%s)", name)
+	ctx := context.Background()
+
+	return s3.getRange(ctx, name, 0, -1)
+}
+
+// Exists checks if the given object exists.
+func (s3 *S3Storage) Exists(name string) (bool, error) {
+	name = s3.trimLeading(name)
+	//log.Infof("S3Storage::Exists(%s)", name)
+
+	ctx := context.Background()
+
+	_, err := s3.client.StatObject(ctx, s3.name, name, minio.StatObjectOptions{})
+	if err != nil {
+		if s3.isDoesNotExist(err) {
+			return false, nil
+		}
+		return false, errors.Wrap(err, "stat s3 object")
+	}
+
+	return true, nil
+}
+
+// Write uploads the provided data as an object into the bucket.
+func (s3 *S3Storage) Write(name string, data []byte) error {
+	name = s3.trimLeading(name)
+
+	log.Infof("S3Storage::Write(%s)", name)
+
+	ctx := context.Background()
+	sse, err := s3.getServerSideEncryption(ctx)
+	if err != nil {
+		return err
+	}
+
+	size := int64(len(data))
+
+	r := bytes.NewReader(data)
+	_, err = s3.client.PutObject(ctx, s3.name, name, r, size, minio.PutObjectOptions{
+		PartSize:             s3.partSize, // use the configured part size (0 lets minio choose)
+		ServerSideEncryption: sse,
+		UserMetadata:         s3.putUserMetadata,
+	})
+
+	if err != nil {
+		return errors.Wrap(err, "upload s3 object")
+	}
+
+	return nil
+}
+
+// Stat returns information about the specified object.
+func (s3 *S3Storage) Stat(name string) (*StorageInfo, error) {
+	name = s3.trimLeading(name)
+
+	//log.Infof("S3Storage::Stat(%s)", name)
+	ctx := context.Background()
+
+	objInfo, err := s3.client.StatObject(ctx, s3.name, name, minio.StatObjectOptions{})
+	if err != nil {
+		if s3.isDoesNotExist(err) {
+			return nil, DoesNotExistError
+		}
+		return nil, err
+	}
+
+	return &StorageInfo{
+		Name:    s3.trimName(name),
+		Size:    objInfo.Size,
+		ModTime: objInfo.LastModified,
+	}, nil
+}
+
+// Remove deletes the object with the given name.
+func (s3 *S3Storage) Remove(name string) error {
+	name = s3.trimLeading(name)
+
+	log.Infof("S3Storage::Remove(%s)", name)
+	ctx := context.Background()
+
+	return s3.client.RemoveObject(ctx, s3.name, name, minio.RemoveObjectOptions{})
+}
+
+func (s3 *S3Storage) List(path string) ([]*StorageInfo, error) {
+	path = s3.trimLeading(path)
+
+	log.Infof("S3Storage::List(%s)", path)
+	ctx := context.Background()
+
+	// Ensure the object name actually ends with a dir suffix. Otherwise we'll just iterate the
+	// object itself as one prefix item.
+	if path != "" {
+		path = strings.TrimSuffix(path, DirDelim) + DirDelim
+	}
+
+	opts := minio.ListObjectsOptions{
+		Prefix:    path,
+		Recursive: false,
+		UseV1:     s3.listObjectsV1,
+	}
+
+	var stats []*StorageInfo
+	for object := range s3.client.ListObjects(ctx, s3.name, opts) {
+		// Catch the error when failed to list objects.
+		if object.Err != nil {
+			return nil, object.Err
+		}
+		// This sometimes happens with empty buckets.
+		if object.Key == "" {
+			continue
+		}
+		// The s3 client can also return the directory itself in the ListObjects call above.
+		if object.Key == path {
+			continue
+		}
+
+		stats = append(stats, &StorageInfo{
+			Name:    s3.trimName(object.Key),
+			Size:    object.Size,
+			ModTime: object.LastModified,
+		})
+	}
+
+	return stats, nil
+}
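A short sketch of listing the immediate children of a directory-style prefix (store is an *S3Storage from the sketches above; the prefix is a placeholder):

infos, err := store.List("/configs")
if err != nil {
	// handle error
}
for _, info := range infos {
	log.Infof("%s (%d bytes, modified %s)", info.Name, info.Size, info.ModTime)
}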
+
+// trimLeading removes a leading / from the file name
+func (s3 *S3Storage) trimLeading(file string) string {
+	if len(file) == 0 {
+		return file
+	}
+
+	if file[0] == '/' {
+		return file[1:]
+	}
+	return file
+}
+
+// trimName removes the leading directory prefix
+func (s3 *S3Storage) trimName(file string) string {
+	slashIndex := strings.LastIndex(file, "/")
+	if slashIndex < 0 {
+		return file
+	}
+
+	name := file[slashIndex+1:]
+	return name
+}
+
+// getServerSideEncryption returns the SSE to use.
+func (s3 *S3Storage) getServerSideEncryption(ctx context.Context) (encrypt.ServerSide, error) {
+	if value := ctx.Value(sseConfigKey); value != nil {
+		if sse, ok := value.(encrypt.ServerSide); ok {
+			return sse, nil
+		}
+		return nil, errors.New("invalid SSE config override provided in the context")
+	}
+
+	return s3.defaultSSE, nil
+}
+
+// isDoesNotExist returns true if error means that object key is not found.
+func (s3 *S3Storage) isDoesNotExist(err error) bool {
+	return minio.ToErrorResponse(errors.Cause(err)).Code == "NoSuchKey"
+}
+
+// isObjNotFound returns true if the error means that the object was not found
+func (s3 *S3Storage) isObjNotFound(err error) bool {
+	// minio reports missing objects as "NoSuchKey"; keep the original
+	// "NotFoundObject" check as a defensive fallback.
+	code := minio.ToErrorResponse(errors.Cause(err)).Code
+	return code == "NoSuchKey" || code == "NotFoundObject"
+}
+
+func (s3 *S3Storage) getRange(ctx context.Context, name string, off, length int64) ([]byte, error) {
+	sse, err := s3.getServerSideEncryption(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	opts := &minio.GetObjectOptions{ServerSideEncryption: sse}
+	if length != -1 {
+		if err := opts.SetRange(off, off+length-1); err != nil {
+			return nil, err
+		}
+	} else if off > 0 {
+		if err := opts.SetRange(off, 0); err != nil {
+			return nil, err
+		}
+	}
+	r, err := s3.client.GetObject(ctx, s3.name, name, *opts)
+	if err != nil {
+		if s3.isObjNotFound(err) {
+			return nil, DoesNotExistError
+		}
+		return nil, err
+	}
+
+	// The object-not-found error is only revealed after the first Read, which issues
+	// the initial GET request, so perform a zero-length read here to surface
+	// DoesNotExistError early.
+	if _, err := r.Read(nil); err != nil {
+		r.Close()
+		if s3.isObjNotFound(err) {
+			return nil, DoesNotExistError
+		}
+
+		return nil, errors.Wrap(err, "Read from S3 failed")
+	}
+
+	return ioutil.ReadAll(r)
+}
+
+type overrideSignerType struct {
+	credentials.Provider
+	signerType credentials.SignatureType
+}
+
+func (s *overrideSignerType) Retrieve() (credentials.Value, error) {
+	v, err := s.Provider.Retrieve()
+	if err != nil {
+		return v, err
+	}
+	if !v.SignerType.IsAnonymous() {
+		v.SignerType = s.signerType
+	}
+	return v, nil
+}

+ 55 - 0
pkg/storage/storage.go

@@ -0,0 +1,55 @@
+package storage
+
+import (
+	"os"
+	"time"
+)
+
+// DoesNotExistError is used as a generic error to return when a target path does not
+// exist in storage. Equivalent to os.ErrNotExist such that it will work with os.IsNotExist(err)
+var DoesNotExistError = os.ErrNotExist
+
+// StorageInfo is a data object containing basic information about the path in storage.
+type StorageInfo struct {
+	Name    string    // base name of the file
+	Size    int64     // length in bytes for regular files
+	ModTime time.Time // modification time
+}
+
+// Storage provides an API for storing binary data
+type Storage interface {
+	// FullPath returns the storage working path combined with the path provided
+	FullPath(path string) string
+
+	// Stat returns the StorageInfo for the specific path.
+	Stat(path string) (*StorageInfo, error)
+
+	// Read uses the relative path of the storage combined with the provided path to
+	// read the contents.
+	Read(path string) ([]byte, error)
+
+	// Write uses the relative path of the storage combined with the provided path
+	// to write a new file or overwrite an existing file.
+	Write(path string, data []byte) error
+
+	// Remove uses the relative path of the storage combined with the provided path to
+	// remove a file from storage permanently.
+	Remove(path string) error
+
+	// Exists uses the relative path of the storage combined with the provided path to
+	// determine if the file exists.
+	Exists(path string) (bool, error)
+
+	// List uses the relative path of the storage combined with the provided path to return
+	// storage information for the files.
+	List(path string) ([]*StorageInfo, error)
+}
+
+// IsNotExist returns true if the error provided from a storage object is DoesNotExistError
+func IsNotExist(err error) bool {
+	// DoesNotExistError is os.ErrNotExist, so defer to the standard check
+	// rather than comparing error strings.
+	return os.IsNotExist(err)
+}
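Since every backend satisfies the same Storage interface, callers can stay provider-agnostic. A minimal sketch, assuming a hypothetical read-or-initialize helper:

// readOrInit reads path from any Storage backend, writing and returning
// defaultData on first use when the object does not yet exist.
func readOrInit(store Storage, path string, defaultData []byte) ([]byte, error) {
	data, err := store.Read(path)
	if IsNotExist(err) {
		if werr := store.Write(path, defaultData); werr != nil {
			return nil, werr
		}
		return defaultData, nil
	}
	if err != nil {
		return nil, err
	}
	return data, nil
}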

+ 87 - 0
pkg/util/atomic/atomicrunstate.go

@@ -0,0 +1,87 @@
+package atomic
+
+import (
+	"sync"
+)
+
+// AtomicRunState can be used to provide thread-safe start/stop functionality to internal run-loops
+// inside a goroutine.
+type AtomicRunState struct {
+	lock     sync.Mutex
+	stopping bool
+	stop     chan struct{}
+	reset    chan struct{}
+}
+
+// Start checks for an existing run state and returns false if the run state has already started. If
+// the run state has not started, then it will advance to the started state and return true.
+func (ars *AtomicRunState) Start() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	if ars.stop != nil {
+		return false
+	}
+
+	ars.stop = make(chan struct{})
+	return true
+}
+
+// OnStop returns a channel that should be used within a select inside a goroutine's run loop.
+// It is signaled whenever Stop() is executed. Once the channel is signaled, Reset() should be
+// called if the run state is to be used again.
+func (ars *AtomicRunState) OnStop() <-chan struct{} {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	return ars.stop
+}
+
+// Stop closes the stop channel, triggering any selects waiting on OnStop()
+func (ars *AtomicRunState) Stop() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	if !ars.stopping && ars.stop != nil {
+		ars.stopping = true
+		ars.reset = make(chan struct{})
+		close(ars.stop)
+		return true
+	}
+
+	return false
+}
+
+// Reset should be called in the select case for OnStop(). Note that calling Reset() before the
+// OnStop() signal has been received will cause the Stop signal to be missed.
+func (ars *AtomicRunState) Reset() {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	close(ars.reset)
+	ars.stopping = false
+	ars.stop = nil
+}
+
+// IsRunning returns true if the state is running or in the process of stopping.
+func (ars *AtomicRunState) IsRunning() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	return ars.stop != nil
+}
+
+// IsStopping returns true if the run state has been stopped, but not yet reset.
+func (ars *AtomicRunState) IsStopping() bool {
+	ars.lock.Lock()
+	defer ars.lock.Unlock()
+
+	return ars.stopping && ars.stop != nil
+}
+
+// WaitForReset blocks until Reset() is called, if and only if the run state is in the process of stopping.
+func (ars *AtomicRunState) WaitForReset() {
+	if ars.IsStopping() {
+		<-ars.reset
+	}
+}
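A compressed sketch of the lifecycle protocol these methods describe (the IntervalRunner example further below demonstrates it in full): the worker goroutine selects on OnStop() and calls Reset(); any other goroutine may call Stop().

var ars AtomicRunState

ars.WaitForReset() // never start while a previous stop is still settling
if ars.Start() {
	go func() {
		for {
			select {
			case <-ars.OnStop():
				ars.Reset() // return to the idle state so Start() works again
				return
			case <-time.After(time.Second):
				// periodic work here
			}
		}
	}()
}

// ... later, from any goroutine:
ars.Stop()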

+ 174 - 0
pkg/util/atomic/atomicrunstate_test.go

@@ -0,0 +1,174 @@
+package atomic
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+// NOTE: This test uses time.Sleep() in an attempt to specifically schedule concurrent actions for testing
+// NOTE: Testing concurrency is hard, so if there are inconsistent results, make sure it's not just the timing
+// NOTE: of the test on the testing hardware.
+func TestRunState(t *testing.T) {
+	t.Parallel()
+
+	var ars AtomicRunState
+
+	if !ars.Start() {
+		t.Fatalf("Failed to Start() AtomicRunState")
+	}
+
+	if ars.Start() {
+		t.Fatalf("Started AtomicRunState a second time")
+	}
+
+	success := make(chan bool)
+
+	go func() {
+		cycles := 0
+		for {
+			// Our test expects exactly 1 cycle, so if we exceed that, we fail!
+			if cycles >= 2 {
+				success <- false
+				return
+			}
+			// create a "work" time before the select
+			time.Sleep(1 * time.Second)
+
+			select {
+			case <-ars.OnStop():
+				t.Logf("Stopped\n")
+				ars.Reset()
+				success <- true
+				return
+			case <-time.After(2 * time.Second):
+				t.Logf("Tick\n")
+			}
+			cycles++
+		}
+	}()
+
+	// Wait for one full work cycle (3 seconds), attempt Stop during "work" phase
+	time.Sleep(3500 * time.Millisecond)
+	ars.Stop()
+
+	result := <-success
+	if !result {
+		t.Fatalf("Executed too many work cycles, expected 1 cycle")
+	}
+}
+
+// leaks goroutines potentially, so only use in testing!
+func waitChannelFor(wg *sync.WaitGroup) chan bool {
+	ch := make(chan bool)
+	go func() {
+		wg.Wait()
+		ch <- true
+	}()
+	return ch
+}
+
+func TestDoubleWait(t *testing.T) {
+	t.Parallel()
+
+	var ars AtomicRunState
+
+	ars.WaitForReset()
+
+	if !ars.Start() {
+		t.Fatalf("Failed to Start() AtomicRunState")
+	}
+
+	if ars.Start() {
+		t.Fatalf("Started AtomicRunState a second time")
+	}
+
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	go func() {
+		t.Logf("GoRoutine 1 Waiting....")
+		<-ars.OnStop()
+		wg.Done()
+	}()
+
+	go func() {
+		t.Logf("GoRoutine 2 Waiting....")
+		<-ars.OnStop()
+		wg.Done()
+	}()
+
+	time.Sleep(1 * time.Second)
+	ars.Stop()
+	select {
+	case <-time.After(time.Second):
+		t.Fatalf("Did not receive signal from both go routines after a second\n")
+		return
+	case <-waitChannelFor(&wg):
+		t.Logf("Received signals from both go routines\n")
+	}
+	ars.Reset()
+}
+
+func TestContinuousConcurrentStartsAndStops(t *testing.T) {
+	t.Parallel()
+
+	const cycles = 5
+
+	var ars AtomicRunState
+	started := make(chan bool)
+
+	var wg sync.WaitGroup
+	wg.Add(cycles)
+
+	// continuously try and start the ars on a tight loop
+	// throttled by OnStop and WaitForReset()
+	go func() {
+		defer func() {
+			if e := recover(); e != nil {
+				// sometimes the waitgroup will hit a negative value at the end of the test
+				// this is ok given the way the test behaves (chaotic start/stop calls), so
+				// we can safely ignore.
+			}
+		}()
+
+		firstCycle := true
+		for {
+			ars.WaitForReset()
+			if ars.Start() {
+				t.Logf("Started")
+				if firstCycle {
+					firstCycle = false
+					started <- true
+				}
+				wg.Done()
+			}
+
+			<-ars.OnStop()
+			t.Logf("Stopped")
+		}
+	}()
+
+	// wait for an initial start
+	<-started
+
+	// Loop Stop/Resets from other goroutines
+	go func() {
+		for {
+			time.Sleep(100 * time.Millisecond)
+			if ars.Stop() {
+				<-ars.OnStop()
+				time.Sleep(500 * time.Millisecond)
+				ars.Reset()
+			}
+		}
+	}()
+
+	// Wait for full cycles
+	select {
+	case <-time.After(5 * time.Second):
+		t.Fatalf("Didn't complete %d cycles after 10 seconds", cycles)
+	case <-waitChannelFor(&wg):
+		t.Logf("Completed!")
+	}
+}

+ 139 - 0
pkg/util/atomic/example_intervalrunner_test.go

@@ -0,0 +1,139 @@
+package atomic_test
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/util/atomic"
+)
+
+// IntervalRunner is an example implementation of AtomicRunState.
+type IntervalRunner struct {
+	runState atomic.AtomicRunState
+	action   func()
+	interval time.Duration
+}
+
+// NewIntervalRunner creates a new instance of an interval runner to execute the provided
+// function on a designated interval until explicitly stopped.
+func NewIntervalRunner(action func(), interval time.Duration) *IntervalRunner {
+	return &IntervalRunner{
+		action:   action,
+		interval: interval,
+	}
+}
+
+// Start begins the interval execution. It returns true if the interval execution successfully starts.
+// It will return false if the interval execution is already running.
+func (ir *IntervalRunner) Start() bool {
+	// Before we attempt to start, we must ensure we are not in a stopping state; this is a
+	// common pattern to use with AtomicRunState.
+	ir.runState.WaitForReset()
+
+	// This will atomically check the current state to ensure we can run, then advances the state.
+	// If the state is already started, it will return false.
+	if !ir.runState.Start() {
+		return false
+	}
+
+	// our run state is advanced, let's execute our action on the interval
+	// spawn a new goroutine which will loop and wait the interval each iteration
+	go func() {
+		for {
+			// use a select statement to receive whichever channel receives data first
+			select {
+			// if our stop channel receives data, it means we have explicitly called
+			// Stop(), and must reset our AtomicRunState to its initial idle state
+			case <-ir.runState.OnStop():
+				ir.runState.Reset()
+				return // exit go routine
+
+			// After our interval elapses, fall through
+			case <-time.After(ir.interval):
+			}
+
+			// Execute the function
+			ir.action()
+
+			// Loop back to the select where we will wait for the interval to elapse
+			// or an explicit stop to be called
+		}
+	}()
+
+	return true
+}
+
+// Stop will explicitly stop the execution of the interval runner. If an action is already executing, it will wait
+// until completion before processing the stop. Any attempts to start during the stopping phase will block until
+// it's possible to Start() again
+func (ir *IntervalRunner) Stop() bool {
+	return ir.runState.Stop()
+}
+
+func Example_intervalRunner() {
+	count := 0
+
+	// As a general test, we'll use a goroutine which waits for a specific number of
+	// ticks before calling stop, then issues a signal back to the main thread
+	var wg sync.WaitGroup
+	wg.Add(4)
+
+	// Create a new IntervalRunner instance to execute our print action every second
+	ir := NewIntervalRunner(
+		func() {
+			fmt.Printf("Tick[%d]\n", count)
+			count++
+			// advance the wait group count
+			wg.Done()
+		},
+		time.Second,
+	)
+
+	// Start the runner, panic on failure
+	if !ir.Start() {
+		panic("Failed to start interval runner!")
+	}
+
+	// spin up a second goroutine which will wait for a specific number of
+	// ticks before calling Stop(). This is a bit contrived, but demonstrates
+	// multiple goroutines controlling the same interval runner.
+	complete := make(chan bool)
+	go func() {
+		wg.Wait()
+
+		// Stop the interval runner, notify main thread
+		ir.Stop()
+		complete <- true
+	}()
+
+	<-complete
+
+	// Start immediately again using a different total tick count
+	count = 0
+	wg.Add(2)
+
+	// Start the runner, panic on failure
+	if !ir.Start() {
+		panic("Failed to start interval runner!")
+	}
+
+	// Create a new Stop waiter
+	go func() {
+		wg.Wait()
+
+		// Stop the interval runner, notify main thread
+		ir.Stop()
+		complete <- true
+	}()
+
+	<-complete
+
+	// Output:
+	// Tick[0]
+	// Tick[1]
+	// Tick[2]
+	// Tick[3]
+	// Tick[0]
+	// Tick[1]
+}

+ 34 - 8
test/cloud_test.go

@@ -10,6 +10,7 @@ import (
 
 	"github.com/kubecost/cost-model/pkg/cloud"
 	"github.com/kubecost/cost-model/pkg/clustercache"
+	"github.com/kubecost/cost-model/pkg/config"
 	"github.com/kubecost/cost-model/pkg/costmodel"
 	"github.com/kubecost/cost-model/pkg/costmodel/clusters"
 
@@ -97,6 +98,10 @@ func TestNodePriceFromCSV(t *testing.T) {
 	nameWant := "gke-standard-cluster-1-pool-1-91dc432d-cg69"
 	labelFooWant := "labelfoo"
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	n := &v1.Node{}
 	n.Spec.ProviderID = providerIDWant
 	n.Name = nameWant
@@ -108,7 +113,7 @@ func TestNodePriceFromCSV(t *testing.T) {
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -138,7 +143,7 @@ func TestNodePriceFromCSV(t *testing.T) {
 	c2 := &cloud.CSVProvider{
 		CSVLocation: "../configs/fake.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	k3 := c.GetKey(n.Labels, n)
@@ -153,6 +158,10 @@ func TestNodePriceFromCSVWithRegion(t *testing.T) {
 	nameWant := "foo"
 	labelFooWant := "labelfoo"
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	n := &v1.Node{}
 	n.Spec.ProviderID = providerIDWant
 	n.Name = nameWant
@@ -180,7 +189,7 @@ func TestNodePriceFromCSVWithRegion(t *testing.T) {
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_region.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -230,7 +239,7 @@ func TestNodePriceFromCSVWithRegion(t *testing.T) {
 	c2 := &cloud.CSVProvider{
 		CSVLocation: "../configs/fake.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 	k5 := c.GetKey(n.Labels, n)
@@ -265,10 +274,14 @@ type FakeClusterMap struct {
 
 func TestNodePriceFromCSVWithBadConfig(t *testing.T) {
 	os.Setenv("CONFIG_PATH", "../config")
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("invalid.json"),
+			Config: cloud.NewProviderConfig(confMan, "invalid.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -294,10 +307,15 @@ func TestNodePriceFromCSVWithBadConfig(t *testing.T) {
 
 func TestSourceMatchesFromCSV(t *testing.T) {
 	os.Setenv("CONFIG_PATH", "../configs")
+
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "/default.json"),
 		},
 	}
 	c.DownloadPricingData()
@@ -369,10 +387,14 @@ func TestNodePriceFromCSVWithCase(t *testing.T) {
 	n.Labels[v1.LabelZoneRegion] = "eastus2"
 	wantPrice := "0.13370357"
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}
 
@@ -399,10 +421,14 @@ func TestNodePriceFromCSVByClass(t *testing.T) {
 	wantpricefloat := 0.13370357
 	wantPrice := fmt.Sprintf("%f", (math.Round(wantpricefloat*1000000) / 1000000))
 
+	confMan := config.NewConfigFileManager(&config.ConfigFileManagerOpts{
+		LocalConfigPath: "./",
+	})
+
 	c := &cloud.CSVProvider{
 		CSVLocation: "../configs/pricing_schema_case.csv",
 		CustomProvider: &cloud.CustomProvider{
-			Config: cloud.NewProviderConfig("../configs/default.json"),
+			Config: cloud.NewProviderConfig(confMan, "../configs/default.json"),
 		},
 	}