Skip to content

Commit b280f42

Browse files
committed
receive: Add retry-after backoff with jitter via header field to active-series-limiting
By default, active-series-limiting only returns a 429 when the tenant has exceeded the limit. Prometheus parses the Retry-After header by default¹, and if the header is not set, it does not set a default backoff on the Prometheus side. This results in an instant retry of sending the data again, which may result in a potential increase of requests. ¹https://github.com/prometheus/prometheus/blob/bfbae39931a6ddeb5913a6ea8d48a34bbebc6d29/storage/remote/client.go#L309 Signed-off-by: roth-wine <[email protected]>
1 parent 49a560d commit b280f42

File tree

4 files changed

+20
-1
lines changed

4 files changed

+20
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
1313
### Fixed
1414

1515
### Added
16+
- [#8356](https://github.com/thanos-io/thanos/pull/8356) Receive: add `Retry-After` header with jittered backoff to active-series-limiting 429 responses
1617

1718
- [#8366](https://github.com/thanos-io/thanos/pull/8366) Store: optionally ignore Parquet migrated blocks
1819
- [#8359](https://github.com/thanos-io/thanos/pull/8359) Tools: add `--shipper.upload-compacted` flag for uploading compacted blocks to bucket upload-blocks

cmd/thanos/receive.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ func runReceive(
293293
ReplicationProtocol: receive.ReplicationProtocol(conf.replicationProtocol),
294294
OtlpEnableTargetInfo: conf.otlpEnableTargetInfo,
295295
OtlpResourceAttributes: conf.otlpResourceAttributes,
296+
RetryAfterBackoff: time.Duration(*conf.retryAfterBackoff),
297+
RetryAfterJitter: conf.retryAfterJitter,
296298
})
297299

298300
grpcProbe := prober.NewGRPC()
@@ -865,6 +867,8 @@ type receiveConfig struct {
865867
compression string
866868
replicationProtocol string
867869
grpcServiceConfig string
870+
retryAfterBackoff *model.Duration
871+
retryAfterJitter float64
868872

869873
tsdbMinBlockDuration *model.Duration
870874
tsdbMaxBlockDuration *model.Duration
@@ -1078,6 +1082,10 @@ func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) {
10781082

10791083
cmd.Flag("receive.lazy-retrieval-max-buffered-responses", "The lazy retrieval strategy can buffer up to this number of responses. This is to limit the memory usage. This flag takes effect only when the lazy retrieval strategy is enabled.").
10801084
Default("20").IntVar(&rc.lazyRetrievalMaxBufferedResponses)
1085+
1086+
rc.retryAfterBackoff = extkingpin.ModelDuration(cmd.Flag("receive.active-series-limiting.retry-after-backoff", "Backoff for retry-after header, set if active-series-limiting is enabled").Default("5s"))
1087+
1088+
cmd.Flag("receive.active-series-limiting.retry-after-jitter", "Jitter for retry-after backoff").Default("0.5").Float64Var(&rc.retryAfterJitter)
10811089
}
10821090

10831091
// determineMode returns the ReceiverMode that this receiver is configured to run in.

docs/components/receive.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ Please see the metric `thanos_receive_forward_delay_seconds` to see if you need
372372

373373
The following formula is used for calculating quorum:
374374

375-
```go mdox-exec="sed -n '1046,1056p' pkg/receive/handler.go"
375+
```go mdox-exec="sed -n '1051,1061p' pkg/receive/handler.go"
376376
// writeQuorum returns minimum number of replicas that has to confirm write success before claiming replication success.
377377
func (h *Handler) writeQuorum() int {
378378
// NOTE(GiedriusS): this is here because otherwise RF=2 doesn't make sense as all writes
@@ -670,5 +670,10 @@ Flags:
670670
this number of responses. This is to limit the
671671
memory usage. This flag takes effect only when
672672
the lazy retrieval strategy is enabled.
673+
--receive.active-series-limiting.retry-after-backoff=5s
674+
Backoff for retry-after header, set if
675+
active-series-limiting is enabled
676+
--receive.active-series-limiting.retry-after-jitter=0.5
677+
Jitter for retry-after backoff
673678
674679
```

pkg/receive/handler.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
"google.golang.org/grpc/codes"
4343
"google.golang.org/grpc/status"
4444

45+
"github.com/thanos-io/thanos/internal/cortex/util"
4546
"github.com/thanos-io/thanos/pkg/api"
4647
statusapi "github.com/thanos-io/thanos/pkg/api/status"
4748
"github.com/thanos-io/thanos/pkg/logging"
@@ -119,6 +120,8 @@ type Options struct {
119120
ReplicationProtocol ReplicationProtocol
120121
OtlpEnableTargetInfo bool
121122
OtlpResourceAttributes []string
123+
RetryAfterBackoff time.Duration
124+
RetryAfterJitter float64
122125
}
123126

124127
// Handler serves a Prometheus remote write receiving HTTP endpoint.
@@ -534,6 +537,8 @@ func (h *Handler) receiveHTTP(w http.ResponseWriter, r *http.Request) {
534537

535538
// Fail request fully if tenant has exceeded set limit.
536539
if !under {
540+
backoffDuration := util.DurationWithJitter(h.options.RetryAfterBackoff, h.options.RetryAfterJitter)
541+
w.Header().Add("Retry-After", time.Now().Add(backoffDuration).Format(http.TimeFormat))
537542
http.Error(w, "tenant is above active series limit", http.StatusTooManyRequests)
538543
return
539544
}

0 commit comments

Comments
 (0)