Просмотр исходного кода

Hard Limit Max Wait to 10s, Add More Tests

Matt Bolt 4 лет назад
Родитель
Сommit
1645d719ea
3 измененных файлов с 69 добавлено и 3 удалено
  1. 1 1
      pkg/env/costmodelenv.go
  2. 12 0
      pkg/prom/prom.go
  3. 56 2
      pkg/prom/ratelimitedclient_test.go

+ 1 - 1
pkg/env/costmodelenv.go

@@ -107,7 +107,7 @@ func IsClusterCacheFileEnabled() bool {
 // IsPrometheusRetryOnRateLimitResponse will attempt to retry if a 429 response is received OR a 400 with a body containing
 // ThrottleException (common in AWS services like AMP)
 func IsPrometheusRetryOnRateLimitResponse() bool {
-	return GetBool(PrometheusRetryOnRateLimitResponseEnvVar, false)
+	return GetBool(PrometheusRetryOnRateLimitResponseEnvVar, true)
 }
 
 // GetPrometheusQueryOffset returns the time.Duration to offset all prometheus queries by. NOTE: This env var is applied

+ 12 - 0
pkg/prom/prom.go

@@ -61,6 +61,11 @@ func (auth *ClientAuth) Apply(req *http.Request) {
 //  Rate Limited Error
 //--------------------------------------------------------------------------
 
+// MaxRetryAfterDuration is the maximum amount of time we should ever wait
+// during a retry. This is to prevent starvation on the request threads
+const MaxRetryAfterDuration = 10 * time.Second
+
+// RateLimitResponseStatus contains the status of the rate limited retries
 type RateLimitResponseStatus struct {
 	RetriesRemaining int
 	WaitTime         time.Duration
@@ -267,6 +272,13 @@ func (rlpc *RateLimitedPrometheusClient) worker() {
 					status = append(status, &RateLimitResponseStatus{RetriesRemaining: retries, WaitTime: retryAfter})
 					log.DedupedInfof(50, "Rate Limited Prometheus Request. Waiting for: %.2f seconds. Retries Remaining: %d", retryAfter.Seconds(), retries)
 
+					// To prevent total starvation of request threads, hard limit wait time to 10s. We also want quota limits/throttles
+					// to eventually pass through as an error. For example, if some quota is reached with 10 days left, we clearly
+					// don't want to block for 10 days.
+					if retryAfter > MaxRetryAfterDuration {
+						retryAfter = MaxRetryAfterDuration
+					}
+
 					// execute wait and retry
 					time.Sleep(retryAfter)
 					res, body, warnings, err = rlpc.client.Do(ctx, req)

+ 56 - 2
pkg/prom/ratelimitedclient_test.go

@@ -73,6 +73,19 @@ func newSuccessfulResponse() *ResponseAndBody {
 	}
 }
 
+// creates a ResponseAndBody representing a 400 status code
+func newFailureResponse() *ResponseAndBody {
+	body := []byte("Fail")
+
+	return &ResponseAndBody{
+		Response: &http.Response{
+			StatusCode: 400,
+			Body:       io.NopCloser(bytes.NewReader(body)),
+		},
+		Body: body,
+	}
+}
+
 // creates a ResponseAndBody representing a 429 status code and 'Retry-After' header
 func newNormalRateLimitedResponse(retryAfter string) *ResponseAndBody {
 	body := []byte("Rate Limitted")
@@ -143,6 +156,47 @@ func TestRateLimitedOnceAndSuccess(t *testing.T) {
 	}
 }
 
+func TestRateLimitedOnceAndFail(t *testing.T) {
+	t.Parallel()
+
+	// creates a prom client with hard coded responses for any requests that
+	// are issued
+	promClient := newMockPromClientWith([]*ResponseAndBody{
+		newNormalRateLimitedResponse("2"),
+		newFailureResponse(),
+	})
+
+	client, err := NewRateLimitedClient(
+		"TestClient",
+		promClient,
+		1,
+		nil,
+		nil,
+		true,
+		"",
+	)
+
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	req, err := http.NewRequest(http.MethodPost, "", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// we just need to execute this  once to see retries in effect
+	res, body, _, err := client.Do(context.Background(), req)
+
+	if res.StatusCode != 400 {
+		t.Fatalf("400 StatusCode expected. Got: %d", res.StatusCode)
+	}
+
+	if string(body) != "Fail" {
+		t.Fatalf("Expected 'fail' message body. Got: %s", string(body))
+	}
+}
+
 func TestRateLimitedResponses(t *testing.T) {
 	t.Parallel()
 
@@ -233,9 +287,9 @@ func TestRateLimitedResponses(t *testing.T) {
 func TestConcurrentRateLimiting(t *testing.T) {
 	t.Parallel()
 
-	// Set QueryConcurrency to 3 here, then test double that
+	// Set QueryConcurrency to 3 here, then add a few for total requests
 	const QueryConcurrency = 3
-	const TotalRequests = QueryConcurrency * 2
+	const TotalRequests = QueryConcurrency + 2
 
 	dateRetry := time.Now().Add(5 * time.Second).Format(time.RFC1123)