
Handle Rate Limit Responses in Prom Client

Matt Bolt 4 years ago
parent
commit
505a9dc7dc

+ 16 - 2
pkg/cmd/agent/agent.go

@@ -85,8 +85,22 @@ func newPrometheusClient() (prometheus.Client, error) {
 
 	timeout := 120 * time.Second
 	keepAlive := 120 * time.Second
-
-	promCli, err := prom.NewPrometheusClient(address, timeout, keepAlive, queryConcurrency, "")
+	tlsHandshakeTimeout := 10 * time.Second
+
+	promCli, err := prom.NewPrometheusClient(address, &prom.PrometheusClientConfig{
+		Timeout:                  timeout,
+		KeepAlive:                keepAlive,
+		TLSHandshakeTimeout:      tlsHandshakeTimeout,
+		TLSInsecureSkipVerify:    env.GetInsecureSkipVerify(),
+		RetryOnRateLimitResponse: env.IsPrometheusRetryOnRateLimitResponse(),
+		Auth: &prom.ClientAuth{
+			Username:    env.GetDBBasicAuthUsername(),
+			Password:    env.GetDBBasicAuthUserPassword(),
+			BearerToken: env.GetDBBearerToken(),
+		},
+		QueryConcurrency: queryConcurrency,
+		QueryLogFile:     "",
+	})
 	if err != nil {
 		return nil, fmt.Errorf("Failed to create prometheus client, Error: %v", err)
 	}

+ 29 - 2
pkg/costmodel/router.go

@@ -1342,9 +1342,23 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 
 	timeout := 120 * time.Second
 	keepAlive := 120 * time.Second
+	tlsHandshakeTimeout := 10 * time.Second
 	scrapeInterval := time.Minute
 
-	promCli, err := prom.NewPrometheusClient(address, timeout, keepAlive, queryConcurrency, "")
+	promCli, err := prom.NewPrometheusClient(address, &prom.PrometheusClientConfig{
+		Timeout:                  timeout,
+		KeepAlive:                keepAlive,
+		TLSHandshakeTimeout:      tlsHandshakeTimeout,
+		TLSInsecureSkipVerify:    env.GetInsecureSkipVerify(),
+		RetryOnRateLimitResponse: env.IsPrometheusRetryOnRateLimitResponse(),
+		Auth: &prom.ClientAuth{
+			Username:    env.GetDBBasicAuthUsername(),
+			Password:    env.GetDBBasicAuthUserPassword(),
+			BearerToken: env.GetDBBearerToken(),
+		},
+		QueryConcurrency: queryConcurrency,
+		QueryLogFile:     "",
+	})
 	if err != nil {
 		klog.Fatalf("Failed to create prometheus client, Error: %v", err)
 	}
@@ -1452,7 +1466,20 @@ func Initialize(additionalConfigWatchers ...*watcher.ConfigMapWatcher) *Accesses
 		thanosAddress := thanos.QueryURL()
 
 		if thanosAddress != "" {
-			thanosCli, _ := thanos.NewThanosClient(thanosAddress, timeout, keepAlive, queryConcurrency, env.GetQueryLoggingFile())
+			thanosCli, _ := thanos.NewThanosClient(thanosAddress, &prom.PrometheusClientConfig{
+				Timeout:                  timeout,
+				KeepAlive:                keepAlive,
+				TLSHandshakeTimeout:      tlsHandshakeTimeout,
+				TLSInsecureSkipVerify:    env.GetInsecureSkipVerify(),
+				RetryOnRateLimitResponse: env.IsPrometheusRetryOnRateLimitResponse(),
+				Auth: &prom.ClientAuth{
+					Username:    env.GetMultiClusterBasicAuthUsername(),
+					Password:    env.GetMultiClusterBasicAuthPassword(),
+					BearerToken: env.GetMultiClusterBearerToken(),
+				},
+				QueryConcurrency: queryConcurrency,
+				QueryLogFile:     env.GetQueryLoggingFile(),
+			})
 
 			_, err = prom.Validate(thanosCli)
 			if err != nil {
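
Both call sites now build the same struct with the same timeouts; if this pattern gets repeated again, a small helper could centralize the defaults. A hypothetical sketch (`defaultClientConfig` is not part of this diff), leaving auth and the retry toggle to the caller:

```go
package costmodel

import (
	"time"

	"github.com/kubecost/cost-model/pkg/prom"
)

// defaultClientConfig is a hypothetical helper, not part of this diff:
// it centralizes the dial/keep-alive/TLS timeouts shared by the agent
// and router call sites.
func defaultClientConfig(queryConcurrency int, queryLogFile string) *prom.PrometheusClientConfig {
	return &prom.PrometheusClientConfig{
		Timeout:             120 * time.Second,
		KeepAlive:           120 * time.Second,
		TLSHandshakeTimeout: 10 * time.Second,
		QueryConcurrency:    queryConcurrency,
		QueryLogFile:        queryLogFile,
	}
}
```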

+ 9 - 1
pkg/env/costmodelenv.go

@@ -81,7 +81,9 @@ const (
 	KubecostConfigBucketEnvVar    = "KUBECOST_CONFIG_BUCKET"
 	ClusterInfoFileEnabledEnvVar  = "CLUSTER_INFO_FILE_ENABLED"
 	ClusterCacheFileEnabledEnvVar = "CLUSTER_CACHE_FILE_ENABLED"
-	PrometheusQueryOffsetEnvVar   = "PROMETHEUS_QUERY_OFFSET"
+
+	PrometheusQueryOffsetEnvVar              = "PROMETHEUS_QUERY_OFFSET"
+	PrometheusRetryOnRateLimitResponseEnvVar = "PROMETHEUS_RETRY_ON_RATE_LIMIT"
 )
 
 // GetKubecostConfigBucket returns a file location for a mounted bucket configuration which is used to store
@@ -102,6 +104,12 @@ func IsClusterCacheFileEnabled() bool {
 	return GetBool(ClusterCacheFileEnabledEnvVar, false)
 }
 
+// IsPrometheusRetryOnRateLimitResponse returns true if the prometheus client should retry requests when a 429 response
+// is received, OR a 400 response with a body containing "ThrottlingException" (common in AWS services like AMP)
+func IsPrometheusRetryOnRateLimitResponse() bool {
+	return GetBool(PrometheusRetryOnRateLimitResponseEnvVar, false)
+}
+
 // GetPrometheusQueryOffset returns the time.Duration to offset all prometheus queries by. NOTE: This env var is applied
 // to all non-range queries made via our query context. This should only be applied when there is a significant delay in
 // data arriving in the target prom db. For example, if supplying a thanos or cortex querier for the prometheus server, using
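
The new toggle is read through the package's `GetBool` helper, so any `strconv.ParseBool`-compatible value enables it. A minimal standalone sketch of the assumed lookup behavior (`getBool` here is a hypothetical stand-in for `env.GetBool`):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// getBool is a hypothetical stand-in for env.GetBool: it parses the
// variable with strconv.ParseBool and falls back to the default when
// the variable is unset or unparseable.
func getBool(key string, def bool) bool {
	if v, ok := os.LookupEnv(key); ok {
		if b, err := strconv.ParseBool(v); err == nil {
			return b
		}
	}
	return def
}

func main() {
	fmt.Println(getBool("PROMETHEUS_RETRY_ON_RATE_LIMIT", false)) // false (unset)

	os.Setenv("PROMETHEUS_RETRY_ON_RATE_LIMIT", "true")
	fmt.Println(getBool("PROMETHEUS_RETRY_ON_RATE_LIMIT", false)) // true
}
```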

+ 133 - 32
pkg/prom/prom.go

@@ -3,14 +3,15 @@ package prom
 import (
 	"context"
 	"crypto/tls"
+	"fmt"
 	"net"
 	"net/http"
 	"net/url"
 	"os"
+	"strings"
 	"time"
 
 	"github.com/kubecost/cost-model/pkg/collections"
-	"github.com/kubecost/cost-model/pkg/env"
 	"github.com/kubecost/cost-model/pkg/log"
 	"github.com/kubecost/cost-model/pkg/util/atomic"
 	"github.com/kubecost/cost-model/pkg/util/fileutil"
@@ -56,6 +57,42 @@ func (auth *ClientAuth) Apply(req *http.Request) {
 	}
 }
 
+//--------------------------------------------------------------------------
+//  Rate Limited Error
+//--------------------------------------------------------------------------
+
+type RateLimitResponseStatus struct {
+	RetriesRemaining int
+	WaitTime         time.Duration
+}
+
+// String creates a string representation of the rate limit status
+func (rtrs *RateLimitResponseStatus) String() string {
+	return fmt.Sprintf("Wait Time: %.2f seconds, Retries Remaining: %d", rtrs.WaitTime.Seconds(), rtrs.RetriesRemaining)
+}
+
+// RateLimitedResponseError contains a list of retry statuses that occurred during
+// retries on a rate limited response
+type RateLimitedResponseError struct {
+	RateLimitStatus []*RateLimitResponseStatus
+}
+
+// Error returns a string representation of the error, including the rate limit
+// status reports
+func (rlre *RateLimitedResponseError) Error() string {
+	var sb strings.Builder
+
+	sb.WriteString("Request was Rate Limited and Retries Exhausted:\n")
+
+	for _, rls := range rlre.RateLimitStatus {
+		sb.WriteString(" * ")
+		sb.WriteString(rls.String())
+		sb.WriteString("\n")
+	}
+
+	return sb.String()
+}
+
 //--------------------------------------------------------------------------
 //  RateLimitedPrometheusClient
 //--------------------------------------------------------------------------
@@ -63,13 +100,14 @@ func (auth *ClientAuth) Apply(req *http.Request) {
 // RateLimitedPrometheusClient is a prometheus client which limits the total number of
 // concurrent outbound requests allowed at a given moment.
 type RateLimitedPrometheusClient struct {
-	id         string
-	client     prometheus.Client
-	auth       *ClientAuth
-	queue      collections.BlockingQueue
-	decorator  QueryParamsDecorator
-	outbound   *atomic.AtomicInt32
-	fileLogger *golog.Logger
+	id               string
+	client           prometheus.Client
+	auth             *ClientAuth
+	queue            collections.BlockingQueue
+	decorator        QueryParamsDecorator
+	retryOnRateLimit bool
+	outbound         *atomic.AtomicInt32
+	fileLogger       *golog.Logger
 }
 
 // requestCounter is used to determine if the prometheus client keeps track of
@@ -81,11 +119,14 @@ type requestCounter interface {
 
 // NewRateLimitedClient creates a prometheus client which limits the number of concurrent outbound
 // prometheus requests.
-func NewRateLimitedClient(id string, config prometheus.Config, maxConcurrency int, auth *ClientAuth, decorator QueryParamsDecorator, queryLogFile string) (prometheus.Client, error) {
-	c, err := prometheus.NewClient(config)
-	if err != nil {
-		return nil, err
-	}
+func NewRateLimitedClient(
+	id string,
+	client prometheus.Client,
+	maxConcurrency int,
+	auth *ClientAuth,
+	decorator QueryParamsDecorator,
+	retryOnRateLimit bool,
+	queryLogFile string) (prometheus.Client, error) {
 
 	queue := collections.NewBlockingQueue()
 	outbound := atomic.NewAtomicInt32(0)
@@ -105,14 +146,24 @@ func NewRateLimitedClient(id string, config prometheus.Config, maxConcurrency in
 		}
 	}
 
+	// default authentication
+	if auth == nil {
+		auth = &ClientAuth{
+			Username:    "",
+			Password:    "",
+			BearerToken: "",
+		}
+	}
+
 	rlpc := &RateLimitedPrometheusClient{
-		id:         id,
-		client:     c,
-		queue:      queue,
-		decorator:  decorator,
-		outbound:   outbound,
-		auth:       auth,
-		fileLogger: logger,
+		id:               id,
+		client:           client,
+		queue:            queue,
+		decorator:        decorator,
+		retryOnRateLimit: retryOnRateLimit,
+		outbound:         outbound,
+		auth:             auth,
+		fileLogger:       logger,
 	}
 
 	// Start concurrent request processing
@@ -168,6 +219,8 @@ type workResponse struct {
 
 // worker is used as a consumer goroutine to pull workRequest from the blocking queue and execute them
 func (rlpc *RateLimitedPrometheusClient) worker() {
+	retryRateLimit := rlpc.retryOnRateLimit
+
 	for {
 		// blocks until there is an item available
 		item := rlpc.queue.Dequeue()
@@ -198,6 +251,34 @@ func (rlpc *RateLimitedPrometheusClient) worker() {
 			roundTripStart := time.Now()
 			res, body, warnings, err := rlpc.client.Do(ctx, req)
 
+			// If retries on rate limited responses are enabled:
+			// * Check for a 429 StatusCode OR a 4xx StatusCode with a body containing "ThrottlingException"
+			// * Attempt to parse a Retry-After duration from the response headers (common on 429)
+			// * If we couldn't determine how long to wait before retrying, default to 1 second
+			if retryRateLimit {
+				var status []*RateLimitResponseStatus
+				retries := 5
+
+				for httputil.IsRateLimited(res, body) && retries > 0 {
+					retries--
+
+					// calculate amount of time to wait before retry, default to 1s
+					retryAfter := httputil.RateLimitedRetryFor(res, time.Second)
+					status = append(status, &RateLimitResponseStatus{RetriesRemaining: retries, WaitTime: retryAfter})
+					log.DedupedInfof(50, "Rate Limited Prometheus Request. Waiting for: %.2f seconds. Retries Remaining: %d", retryAfter.Seconds(), retries)
+
+					// execute wait and retry
+					time.Sleep(retryAfter)
+					res, body, warnings, err = rlpc.client.Do(ctx, req)
+				}
+
+				// if we've broken out of our retry loop and the resp is still rate limited,
+				// then let's generate a meaningful error to pass back
+				if retries == 0 && httputil.IsRateLimited(res, body) {
+					err = &RateLimitedResponseError{RateLimitStatus: status}
+				}
+			}
+
 			// Decrement outbound counter
 			rlpc.outbound.Decrement()
 			LogQueryRequest(rlpc.fileLogger, req, timeInQueue, time.Since(roundTripStart))
@@ -245,30 +326,50 @@ func (rlpc *RateLimitedPrometheusClient) Do(ctx context.Context, req *http.Reque
 //  Client Helpers
 //--------------------------------------------------------------------------
 
-func NewPrometheusClient(address string, timeout, keepAlive time.Duration, queryConcurrency int, queryLogFile string) (prometheus.Client, error) {
-	tlsConfig := &tls.Config{InsecureSkipVerify: env.GetInsecureSkipVerify()}
+// PrometheusClientConfig contains all configurable options for creating a new prometheus client
+type PrometheusClientConfig struct {
+	Timeout                  time.Duration
+	KeepAlive                time.Duration
+	TLSHandshakeTimeout      time.Duration
+	TLSInsecureSkipVerify    bool
+	RetryOnRateLimitResponse bool
+	Auth                     *ClientAuth
+	QueryConcurrency         int
+	QueryLogFile             string
+}
 
-	// may be necessary for long prometheus queries. TODO: make this configurable
+// NewPrometheusClient creates a new rate limited client which limits the number of concurrent outbound requests.
+func NewPrometheusClient(address string, config *PrometheusClientConfig) (prometheus.Client, error) {
+	// may be necessary for long prometheus queries
 	pc := prometheus.Config{
 		Address: address,
 		RoundTripper: &http.Transport{
 			Proxy: http.ProxyFromEnvironment,
 			DialContext: (&net.Dialer{
-				Timeout:   timeout,
-				KeepAlive: keepAlive,
+				Timeout:   config.Timeout,
+				KeepAlive: config.KeepAlive,
 			}).DialContext,
-			TLSHandshakeTimeout: 10 * time.Second,
-			TLSClientConfig:     tlsConfig,
+			TLSHandshakeTimeout: config.TLSHandshakeTimeout,
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: config.TLSInsecureSkipVerify,
+			},
 		},
 	}
 
-	auth := &ClientAuth{
-		Username:    env.GetDBBasicAuthUsername(),
-		Password:    env.GetDBBasicAuthUserPassword(),
-		BearerToken: env.GetDBBearerToken(),
+	client, err := prometheus.NewClient(pc)
+	if err != nil {
+		return nil, err
 	}
 
-	return NewRateLimitedClient(PrometheusClientID, pc, queryConcurrency, auth, nil, queryLogFile)
+	return NewRateLimitedClient(
+		PrometheusClientID,
+		client,
+		config.QueryConcurrency,
+		config.Auth,
+		nil,
+		config.RetryOnRateLimitResponse,
+		config.QueryLogFile,
+	)
 }
 
 // LogQueryRequest logs the query that was sent to prom/thanos with the time in queue and total time after being sent
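
When retries are exhausted, callers receive the aggregated `RateLimitedResponseError` shown above. A self-contained sketch (the two types are copied from this diff) of the message a caller would log:

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

// Copies of the types introduced in pkg/prom/prom.go, for illustration.
type RateLimitResponseStatus struct {
	RetriesRemaining int
	WaitTime         time.Duration
}

func (rtrs *RateLimitResponseStatus) String() string {
	return fmt.Sprintf("Wait Time: %.2f seconds, Retries Remaining: %d", rtrs.WaitTime.Seconds(), rtrs.RetriesRemaining)
}

type RateLimitedResponseError struct {
	RateLimitStatus []*RateLimitResponseStatus
}

func (rlre *RateLimitedResponseError) Error() string {
	var sb strings.Builder
	sb.WriteString("Request was Rate Limited and Retries Exhausted:\n")
	for _, rls := range rlre.RateLimitStatus {
		sb.WriteString(" * ")
		sb.WriteString(rls.String())
		sb.WriteString("\n")
	}
	return sb.String()
}

func main() {
	err := &RateLimitedResponseError{RateLimitStatus: []*RateLimitResponseStatus{
		{RetriesRemaining: 4, WaitTime: 2 * time.Second},
		{RetriesRemaining: 3, WaitTime: time.Second},
	}}
	fmt.Print(err.Error())
	// Request was Rate Limited and Retries Exhausted:
	//  * Wait Time: 2.00 seconds, Retries Remaining: 4
	//  * Wait Time: 1.00 seconds, Retries Remaining: 3
}
```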

+ 296 - 0
pkg/prom/ratelimitedclient_test.go

@@ -0,0 +1,296 @@
+package prom
+
+import (
+	"bytes"
+	"context"
+	"io"
+	"math"
+	"net/http"
+	"net/url"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/kubecost/cost-model/pkg/util"
+	prometheus "github.com/prometheus/client_golang/api"
+)
+
+// ResponseAndBody is just a test object used to hold predefined responses
+// and response bodies
+type ResponseAndBody struct {
+	Response *http.Response
+	Body     []byte
+}
+
+// MockPromClient accepts a slice of responses and bodies to return on requests made.
+// It will cycle these responses linearly, then reset back to the first.
+// Also works with concurrent requests.
+type MockPromClient struct {
+	sync.Mutex
+	responses []*ResponseAndBody
+	current   int
+}
+
+// prometheus.Client URL()
+func (mpc *MockPromClient) URL(ep string, args map[string]string) *url.URL {
+	return nil
+}
+
+// prometheus.Client Do
+func (mpc *MockPromClient) Do(context.Context, *http.Request) (*http.Response, []byte, prometheus.Warnings, error) {
+	// fake latency
+	time.Sleep(250 * time.Millisecond)
+
+	mpc.Lock()
+	defer mpc.Unlock()
+	rnb := mpc.responses[mpc.current]
+	mpc.current++
+	if mpc.current >= len(mpc.responses) {
+		mpc.current = 0
+	}
+
+	return rnb.Response, rnb.Body, nil, nil
+}
+
+// Creates a new mock prometheus client
+func newMockPromClientWith(responses []*ResponseAndBody) prometheus.Client {
+	return &MockPromClient{
+		responses: responses,
+		current:   0,
+	}
+}
+
+// creates a ResponseAndBody representing a 200 status code
+func newSuccessfulResponse() *ResponseAndBody {
+	body := []byte("Success")
+
+	return &ResponseAndBody{
+		Response: &http.Response{
+			StatusCode: 200,
+			Body:       io.NopCloser(bytes.NewReader(body)),
+		},
+		Body: body,
+	}
+}
+
+// creates a ResponseAndBody representing a 429 status code and 'Retry-After' header
+func newNormalRateLimitedResponse(retryAfter string) *ResponseAndBody {
+	body := []byte("Rate Limitted")
+
+	return &ResponseAndBody{
+		Response: &http.Response{
+			StatusCode: 429,
+			Header: http.Header{
+				"Retry-After": []string{retryAfter},
+			},
+			Body: io.NopCloser(bytes.NewReader(body)),
+		},
+		Body: body,
+	}
+}
+
+// creates a ResponseAndBody representing the ThrottlingException 400 status returned by some Amazon services
+func newHackyAmazonRateLimitedResponse() *ResponseAndBody {
+	body := []byte("<ThrottlingException>\n  <Message>Rate exceeded</Message>\n</ThrottlingException>\n")
+
+	return &ResponseAndBody{
+		Response: &http.Response{
+			StatusCode: 400,
+			Body:       io.NopCloser(bytes.NewReader(body)),
+		},
+		Body: body,
+	}
+}
+
+func TestRateLimitedOnceAndSuccess(t *testing.T) {
+	t.Parallel()
+
+	// creates a prom client with hard coded responses for any requests that
+	// are issued
+	promClient := newMockPromClientWith([]*ResponseAndBody{
+		newNormalRateLimitedResponse("2"),
+		newSuccessfulResponse(),
+	})
+
+	client, err := NewRateLimitedClient(
+		"TestClient",
+		promClient,
+		1,
+		nil,
+		nil,
+		true,
+		"",
+	)
+
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	req, err := http.NewRequest(http.MethodPost, "", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// we only need to execute this once to see retries in effect
+	res, body, _, err := client.Do(context.Background(), req)
+
+	if res.StatusCode != 200 {
+		t.Fatalf("200 StatusCode expected. Got: %d", res.StatusCode)
+	}
+
+	if string(body) != "Success" {
+		t.Fatalf("Expected 'Success' message body. Got: %s", string(body))
+	}
+}
+
+func TestRateLimitedResponses(t *testing.T) {
+	t.Parallel()
+
+	dateRetry := time.Now().Add(5 * time.Second).Format(time.RFC1123)
+
+	// creates a prom client with hard coded responses for any requests that
+	// are issued
+	promClient := newMockPromClientWith([]*ResponseAndBody{
+		newNormalRateLimitedResponse("2"),
+		newNormalRateLimitedResponse(dateRetry),
+		newHackyAmazonRateLimitedResponse(),
+		newHackyAmazonRateLimitedResponse(),
+		newNormalRateLimitedResponse("3"),
+	})
+
+	client, err := NewRateLimitedClient(
+		"TestClient",
+		promClient,
+		1,
+		nil,
+		nil,
+		true,
+		"",
+	)
+
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	req, err := http.NewRequest(http.MethodPost, "", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// we only need to execute this once to exhaust the retries
+	_, _, _, err = client.Do(context.Background(), req)
+
+	if err == nil {
+		t.Fatal("Expected a RateLimitedResponseError. Err was nil.")
+	}
+
+	rateLimitErr, ok := err.(*RateLimitedResponseError)
+	if !ok {
+		t.Fatal("Expected a RateLimitedResponseError. Got unexpected type.")
+	}
+
+	t.Logf("%s\n", rateLimitErr.Error())
+
+	// the RateLimitResponseStatus checks below just ensure that wait times were close to the configured Retry-After values
+	rateLimitRetries := rateLimitErr.RateLimitStatus
+
+	if len(rateLimitRetries) != 5 {
+		t.Fatalf("Expected 5 retries. Got: %d", len(rateLimitRetries))
+	}
+
+	// check 2s wait from the integer Retry-After header
+	seconds := rateLimitRetries[0].WaitTime.Seconds()
+	if !util.IsApproximately(seconds, 2.0) {
+		t.Fatalf("Expected 2.0 seconds. Got %.2f", seconds)
+	}
+
+	// check that the datetime-parsed wait time falls within the fuzz window
+	seconds = rateLimitRetries[1].WaitTime.Seconds()
+	if math.Abs(seconds-2.0) > 3.0 {
+		t.Fatalf("Expected delta between 2s and resulting wait time to be within 3s. Seconds: %.2f, Delta: %.2f", seconds, math.Abs(seconds-2.0))
+	}
+
+	// check 1s wait
+	seconds = rateLimitRetries[2].WaitTime.Seconds()
+	if !util.IsApproximately(seconds, 1.0) {
+		t.Fatalf("Expected 1.0 seconds. Got %.2f", seconds)
+	}
+
+	// check 1s wait
+	seconds = rateLimitRetries[3].WaitTime.Seconds()
+	if !util.IsApproximately(seconds, 1.0) {
+		t.Fatalf("Expected 1.0 seconds. Got %.2f", seconds)
+	}
+
+	// check 3s wait
+	seconds = rateLimitRetries[4].WaitTime.Seconds()
+	if !util.IsApproximately(seconds, 3.0) {
+		t.Fatalf("Expected 3.0 seconds. Got %.2f", seconds)
+	}
+
+}
+
+func TestConcurrentRateLimiting(t *testing.T) {
+	t.Parallel()
+
+	// Set QueryConcurrency to 3 here, then test double that
+	const QueryConcurrency = 3
+	const TotalRequests = QueryConcurrency * 2
+
+	dateRetry := time.Now().Add(5 * time.Second).Format(time.RFC1123)
+
+	// creates a prom client with hard coded responses for any requests that
+	// are issued
+	promClient := newMockPromClientWith([]*ResponseAndBody{
+		newNormalRateLimitedResponse("2"),
+		newNormalRateLimitedResponse(dateRetry),
+		newHackyAmazonRateLimitedResponse(),
+		newHackyAmazonRateLimitedResponse(),
+		newNormalRateLimitedResponse("3"),
+	})
+
+	client, err := NewRateLimitedClient(
+		"TestClient",
+		promClient,
+		QueryConcurrency,
+		nil,
+		nil,
+		true,
+		"",
+	)
+
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	errs := make(chan error, TotalRequests)
+
+	for i := 0; i < TotalRequests; i++ {
+		go func() {
+			req, err := http.NewRequest(http.MethodPost, "", nil)
+			if err != nil {
+				errs <- err
+				return
+			}
+
+			// each goroutine only needs to execute this once to see retries in effect
+			_, _, _, err = client.Do(context.Background(), req)
+
+			errs <- err
+		}()
+	}
+
+	for i := 0; i < TotalRequests; i++ {
+		err := <-errs
+		if err == nil {
+			t.Fatal("Expected a RateLimitedResponseError. Err was nil.")
+		}
+
+		rateLimitErr, ok := err.(*RateLimitedResponseError)
+		if !ok {
+			t.Fatal("Expected a RateLimitedResponseError. Got unexpected type.")
+		}
+
+		t.Logf("%s\n", rateLimitErr.Error())
+	}
+}
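
A note on the ±3s window in the RFC1123 assertion above: by the time the second response's header is parsed, roughly 2.5 seconds have already elapsed (the first 2s wait plus two ~250ms mock round trips), so the now+5s datetime parses to about 2.5s of remaining wait. Back-of-envelope arithmetic, with the timings assumed from the mock client:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	target := 5 * time.Second // Retry-After datetime is now+5s at test start

	// elapsed before the header is parsed: the first 2s wait plus two
	// ~250ms mock round trips (timings assumed from the mock client)
	elapsed := 2*time.Second + 2*250*time.Millisecond

	fmt.Println(target - elapsed) // 2.5s: within 3s of the 2s reference
}
```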

+ 19 - 12
pkg/thanos/thanos.go

@@ -67,26 +67,25 @@ func QueryOffset() string {
 	return queryOffset
 }
 
-func NewThanosClient(address string, timeout, keepAlive time.Duration, queryConcurrency int, queryLogFile string) (prometheus.Client, error) {
-	tlsConfig := &tls.Config{InsecureSkipVerify: env.GetInsecureSkipVerify()}
-
+func NewThanosClient(address string, config *prom.PrometheusClientConfig) (prometheus.Client, error) {
 	tc := prometheus.Config{
 		Address: address,
 		RoundTripper: &http.Transport{
 			Proxy: http.ProxyFromEnvironment,
 			DialContext: (&net.Dialer{
-				Timeout:   timeout,
-				KeepAlive: keepAlive,
+				Timeout:   config.Timeout,
+				KeepAlive: config.KeepAlive,
 			}).DialContext,
-			TLSHandshakeTimeout: 10 * time.Second,
-			TLSClientConfig:     tlsConfig,
+			TLSHandshakeTimeout: config.TLSHandshakeTimeout,
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: config.TLSInsecureSkipVerify,
+			},
 		},
 	}
 
-	auth := &prom.ClientAuth{
-		Username:    env.GetMultiClusterBasicAuthUsername(),
-		Password:    env.GetMultiClusterBasicAuthPassword(),
-		BearerToken: env.GetMultiClusterBearerToken(),
+	client, err := prometheus.NewClient(tc)
+	if err != nil {
+		return nil, err
 	}
 
 	// max source resolution decorator
@@ -97,5 +96,13 @@ func NewThanosClient(address string, timeout, keepAlive time.Duration, queryConc
 		return queryParams
 	}
 
-	return prom.NewRateLimitedClient(prom.ThanosClientID, tc, queryConcurrency, auth, maxSourceDecorator, queryLogFile)
+	return prom.NewRateLimitedClient(
+		prom.ThanosClientID,
+		client,
+		config.QueryConcurrency,
+		config.Auth,
+		maxSourceDecorator,
+		config.RetryOnRateLimitResponse,
+		config.QueryLogFile,
+	)
 }

+ 57 - 0
pkg/util/httputil/httputil.go

@@ -5,7 +5,9 @@ import (
 	"fmt"
 	"net/http"
 	"net/url"
+	"strconv"
 	"strings"
+	"time"
 
 	"github.com/kubecost/cost-model/pkg/util/mapper"
 )
@@ -90,6 +92,61 @@ func SetQuery(r *http.Request, query string) *http.Request {
 //  Package Funcs
 //--------------------------------------------------------------------------
 
+// IsRateLimited accepts a response and body and determines whether either
+// indicates that the request was rate limited
+func IsRateLimited(resp *http.Response, body []byte) bool {
+	return IsRateLimitedResponse(resp) || IsRateLimitedBody(resp, body)
+}
+
+// RateLimitedRetryFor returns the wait duration parsed from the Retry-After header, which is either an integer
+// number of seconds or an RFC1123 datetime relative to the current time. If the header is absent or unparseable,
+// the defaultWait parameter is returned.
+func RateLimitedRetryFor(resp *http.Response, defaultWait time.Duration) time.Duration {
+	if resp.Header == nil {
+		return defaultWait
+	}
+	// Retry-After is either the number of seconds to wait or a target datetime (RFC1123)
+	value := resp.Header.Get("Retry-After")
+	if value == "" {
+		return defaultWait
+	}
+
+	seconds, err := strconv.ParseInt(value, 10, 64)
+	if err == nil {
+		return time.Duration(seconds) * time.Second
+	}
+
+	// failed to parse an integer, try datetime RFC1123
+	t, err := time.Parse(time.RFC1123, value)
+	if err == nil {
+		// return 0 if the datetime has already elapsed
+		result := time.Until(t)
+		if result < 0 {
+			return 0
+		}
+		return result
+	}
+
+	// failed to parse datetime, return default
+	return defaultWait
+}
+
+// IsRateLimitedResponse returns true if the status code is a 429 (TooManyRequests)
+func IsRateLimitedResponse(resp *http.Response) bool {
+	return resp.StatusCode == http.StatusTooManyRequests
+}
+
+// IsRateLimitedBody attempts to determine if a response body indicates throttling
+// has occurred. This function is a result of some API providers (AWS) returning
+// a 400 status code instead of 429 for rate limit exceptions.
+func IsRateLimitedBody(resp *http.Response, body []byte) bool {
+	// only consider 4xx statuses (some providers return 400 instead of 429)
+	if resp.StatusCode < http.StatusBadRequest || resp.StatusCode >= http.StatusInternalServerError {
+		return false
+	}
+	return strings.Contains(string(body), "ThrottlingException")
+}
+
 // HeaderString writes the request/response http.Header to a string.
 func HeaderString(h http.Header) string {
 	var sb strings.Builder
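
RateLimitedRetryFor can be exercised in isolation; a small sketch of both header forms (retryFor is a local copy of the function above so the snippet compiles on its own):

```go
package main

import (
	"fmt"
	"net/http"
	"strconv"
	"time"
)

// retryFor is a local copy of RateLimitedRetryFor: integer seconds
// first, then an RFC1123 datetime, then the supplied default.
func retryFor(resp *http.Response, defaultWait time.Duration) time.Duration {
	value := resp.Header.Get("Retry-After")
	if value == "" {
		return defaultWait
	}
	if seconds, err := strconv.ParseInt(value, 10, 64); err == nil {
		return time.Duration(seconds) * time.Second
	}
	if t, err := time.Parse(time.RFC1123, value); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
		return 0
	}
	return defaultWait
}

func main() {
	resp := &http.Response{Header: http.Header{}}

	resp.Header.Set("Retry-After", "2")
	fmt.Println(retryFor(resp, time.Second)) // 2s

	resp.Header.Set("Retry-After", time.Now().Add(5*time.Second).Format(time.RFC1123))
	fmt.Println(retryFor(resp, time.Second).Round(time.Second)) // ~5s
}
```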