4 лет назад · 1645d719ea
--- a/pkg/env/costmodelenv.go
+++ b/pkg/env/costmodelenv.go
@@ -107,7 +107,7 @@ func IsClusterCacheFileEnabled() bool {
 
				 // IsPrometheusRetryOnRateLimitResponse will attempt to retry if a 429 response is received OR a 400 with a body containing
			
 
				 // ThrottleException (common in AWS services like AMP)
			
 
				 func IsPrometheusRetryOnRateLimitResponse() bool {
			
 
				-	return GetBool(PrometheusRetryOnRateLimitResponseEnvVar, false)
			
 
				+	return GetBool(PrometheusRetryOnRateLimitResponseEnvVar, true)
			
 
				 }
			
 
				 
			
 
				 // GetPrometheusQueryOffset returns the time.Duration to offset all prometheus queries by. NOTE: This env var is applied
			
--- a/pkg/prom/prom.go
+++ b/pkg/prom/prom.go
@@ -61,6 +61,11 @@ func (auth *ClientAuth) Apply(req *http.Request) {
 
				 //  Rate Limited Error
			
 
				 //--------------------------------------------------------------------------
			
 
				 
			
 
				+// MaxRetryAfterDuration is the maximum amount of time we should ever wait
			
 
				+// during a retry. This is to prevent starvation on the request threads
			
 
				+const MaxRetryAfterDuration = 10 * time.Second
			
 
				+
			
 
				+// RateLimitResponseStatus contains the status of the rate limited retries
			
 
				 type RateLimitResponseStatus struct {
			
 
				 	RetriesRemaining int
			
 
				 	WaitTime         time.Duration
			
@@ -267,6 +272,13 @@ func (rlpc *RateLimitedPrometheusClient) worker() {
 
				 					status = append(status, &RateLimitResponseStatus{RetriesRemaining: retries, WaitTime: retryAfter})
			
 
				 					log.DedupedInfof(50, "Rate Limited Prometheus Request. Waiting for: %.2f seconds. Retries Remaining: %d", retryAfter.Seconds(), retries)
			
 
				 
			
 
				+					// To prevent total starvation of request threads, hard limit wait time to 10s. We also want quota limits/throttles
			
 
				+					// to eventually pass through as an error. For example, if some quota is reached with 10 days left, we clearly
			
 
				+					// don't want to block for 10 days.
			
 
				+					if retryAfter > MaxRetryAfterDuration {
			
 
				+						retryAfter = MaxRetryAfterDuration
			
 
				+					}
			
 
				+
			
 
				 					// execute wait and retry
			
 
				 					time.Sleep(retryAfter)
			
 
				 					res, body, warnings, err = rlpc.client.Do(ctx, req)
			
--- a/pkg/prom/ratelimitedclient_test.go
+++ b/pkg/prom/ratelimitedclient_test.go
@@ -73,6 +73,19 @@ func newSuccessfulResponse() *ResponseAndBody {
 
				 	}
			
 
				 }
			
 
				 
			
 
				+// creates a ResponseAndBody representing a 400 status code
			
 
				+func newFailureResponse() *ResponseAndBody {
			
 
				+	body := []byte("Fail")
			
 
				+
			
 
				+	return &ResponseAndBody{
			
 
				+		Response: &http.Response{
			
 
				+			StatusCode: 400,
			
 
				+			Body:       io.NopCloser(bytes.NewReader(body)),
			
 
				+		},
			
 
				+		Body: body,
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 // creates a ResponseAndBody representing a 429 status code and 'Retry-After' header
			
 
				 func newNormalRateLimitedResponse(retryAfter string) *ResponseAndBody {
			
 
				 	body := []byte("Rate Limitted")
			
@@ -143,6 +156,47 @@ func TestRateLimitedOnceAndSuccess(t *testing.T) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				+func TestRateLimitedOnceAndFail(t *testing.T) {
			
 
				+	t.Parallel()
			
 
				+
			
 
				+	// creates a prom client with hard coded responses for any requests that
			
 
				+	// are issued
			
 
				+	promClient := newMockPromClientWith([]*ResponseAndBody{
			
 
				+		newNormalRateLimitedResponse("2"),
			
 
				+		newFailureResponse(),
			
 
				+	})
			
 
				+
			
 
				+	client, err := NewRateLimitedClient(
			
 
				+		"TestClient",
			
 
				+		promClient,
			
 
				+		1,
			
 
				+		nil,
			
 
				+		nil,
			
 
				+		true,
			
 
				+		"",
			
 
				+	)
			
 
				+
			
 
				+	if err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+
			
 
				+	req, err := http.NewRequest(http.MethodPost, "", nil)
			
 
				+	if err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+
			
 
				+	// we just need to execute this  once to see retries in effect
			
 
				+	res, body, _, err := client.Do(context.Background(), req)
			
 
				+
			
 
				+	if res.StatusCode != 400 {
			
 
				+		t.Fatalf("400 StatusCode expected. Got: %d", res.StatusCode)
			
 
				+	}
			
 
				+
			
 
				+	if string(body) != "Fail" {
			
 
				+		t.Fatalf("Expected 'fail' message body. Got: %s", string(body))
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 func TestRateLimitedResponses(t *testing.T) {
			
 
				 	t.Parallel()
			
 
				 
			
@@ -233,9 +287,9 @@ func TestRateLimitedResponses(t *testing.T) {
 
				 func TestConcurrentRateLimiting(t *testing.T) {
			
 
				 	t.Parallel()
			
 
				 
			
 
				-	// Set QueryConcurrency to 3 here, then test double that
			
 
				+	// Set QueryConcurrency to 3 here, then add a few for total requests
			
 
				 	const QueryConcurrency = 3
			
 
				-	const TotalRequests = QueryConcurrency * 2
			
 
				+	const TotalRequests = QueryConcurrency + 2
			
 
				 
			
 
				 	dateRetry := time.Now().Add(5 * time.Second).Format(time.RFC1123)