Skip to content

Commit 7bf4d76

Browse files
committed
implement exponential backoff
The createTaskRun and createCustomRun functions now use wait.ExponentialBackoff to retry the creation of a taskRun or customRun when certain errors occur, specifically webhook timeouts. The function isWebhookTimeout checks whether an error is a mutating admission webhook timeout by looking for HTTP 500 and the phrase "timeout" in the error message. If a webhook timeout is detected, the backoff loop will retry the creation up to a configured number of steps, with increasing delay between attempts. If the error is not a webhook timeout, the function will not retry and will return the error immediately. Errors that are not webhook timeouts, e.g. HTTP 400 bad request, validation errors, etc., are not retried and will cause the taskRun creation to fail as expected. By default, the exponential backoff strategy is disabled. To enable this feature, set `enable-wait-exponential-backoff` to `true` in the feature-flags config map. When enabled, the controller will use an exponential backoff strategy to retry taskRun and customRun creation if it encounters transient errors such as admission webhook timeouts. This improves robustness against temporary webhook issues. If the feature flag is set to false, the controller will not retry and will fail immediately on such errors. Configuration for the backoff parameters (duration, factor, steps, etc.) can be set in the wait-exponential-backoff config map. Signed-off-by: Priti Desai <[email protected]>
1 parent 962aaf0 commit 7bf4d76

17 files changed

+905
-45
lines changed

config/300-crds/300-pipelinerun.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2200,6 +2200,8 @@ spec:
22002200
enableStepActions:
22012201
description: EnableStepActions is a no-op flag since StepActions are stable
22022202
type: boolean
2203+
enableWaitExponentialBackoff:
2204+
type: boolean
22032205
enforceNonfalsifiability:
22042206
type: string
22052207
maxResultSize:
@@ -2613,6 +2615,8 @@ spec:
26132615
enableStepActions:
26142616
description: EnableStepActions is a no-op flag since StepActions are stable
26152617
type: boolean
2618+
enableWaitExponentialBackoff:
2619+
type: boolean
26162620
enforceNonfalsifiability:
26172621
type: string
26182622
maxResultSize:
@@ -2900,6 +2904,8 @@ spec:
29002904
enableStepActions:
29012905
description: EnableStepActions is a no-op flag since StepActions are stable
29022906
type: boolean
2907+
enableWaitExponentialBackoff:
2908+
type: boolean
29032909
enforceNonfalsifiability:
29042910
type: string
29052911
maxResultSize:
@@ -5120,6 +5126,8 @@ spec:
51205126
enableStepActions:
51215127
description: EnableStepActions is a no-op flag since StepActions are stable
51225128
type: boolean
5129+
enableWaitExponentialBackoff:
5130+
type: boolean
51235131
enforceNonfalsifiability:
51245132
type: string
51255133
maxResultSize:

config/300-crds/300-taskrun.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,6 +1776,8 @@ spec:
17761776
enableStepActions:
17771777
description: EnableStepActions is a no-op flag since StepActions are stable
17781778
type: boolean
1779+
enableWaitExponentialBackoff:
1780+
type: boolean
17791781
enforceNonfalsifiability:
17801782
type: string
17811783
maxResultSize:
@@ -2063,6 +2065,8 @@ spec:
20632065
enableStepActions:
20642066
description: EnableStepActions is a no-op flag since StepActions are stable
20652067
type: boolean
2068+
enableWaitExponentialBackoff:
2069+
type: boolean
20662070
enforceNonfalsifiability:
20672071
type: string
20682072
maxResultSize:
@@ -3767,6 +3771,8 @@ spec:
37673771
enableStepActions:
37683772
description: EnableStepActions is a no-op flag since StepActions are stable
37693773
type: boolean
3774+
enableWaitExponentialBackoff:
3775+
type: boolean
37703776
enforceNonfalsifiability:
37713777
type: string
37723778
maxResultSize:
@@ -4020,6 +4026,8 @@ spec:
40204026
enableStepActions:
40214027
description: EnableStepActions is a no-op flag since StepActions are stable
40224028
type: boolean
4029+
enableWaitExponentialBackoff:
4030+
type: boolean
40234031
enforceNonfalsifiability:
40244032
type: string
40254033
maxResultSize:

config/config-feature-flags.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,9 @@ data:
130130
enable-kubernetes-sidecar: "false"
131131
# Setting this flag to "false" will have no effect since StepActions are a stable feature
132132
enable-step-actions: "true"
133+
# Controls whether exponential backoff is enabled when creating TaskRuns or CustomRuns.
134+
# If set to "true", the controller will use exponential backoff when retrying failed create operations,
135+
# which can help mitigate issues caused by temporary API server or webhook unavailability.
136+
# If set to "false", exponential backoff will be disabled.
137+
# For advanced tuning of backoff parameters, update the 'wait-exponential-backoff' ConfigMap.
138+
enable-wait-exponential-backoff: "false"
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2025 The Tekton Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# This ConfigMap allows cluster operators to configure the exponential backoff
17+
# parameters used by Tekton Pipelines when retrying Kubernetes API operations,
18+
# such as creating TaskRuns or CustomRuns. Adjusting these values can help
19+
# tune retry behavior in response to webhook timeouts or transient errors.
20+
apiVersion: v1
21+
kind: ConfigMap
22+
metadata:
23+
name: wait-exponential-backoff
24+
namespace: tekton-pipelines
25+
labels:
26+
app.kubernetes.io/instance: default
27+
app.kubernetes.io/part-of: tekton-pipelines
28+
data:
29+
duration: "10s" # The initial duration before the first retry (Go duration string, e.g. "1s").
30+
factor: "2.0" # The factor by which the duration increases after each retry (should not be negative).
31+
jitter: "0.0" # Jitter factor (0.0 = no jitter, 0.2 = up to 20% random additional wait).
32+
steps: "5" # The number of times the duration may change (number of backoff steps).
33+
cap: "60s" # The maximum duration between retries (Go duration string, e.g. "30s").

pkg/apis/config/feature_flags.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,11 @@ const (
107107
EnableKubernetesSidecar = "enable-kubernetes-sidecar"
108108
// DefaultEnableKubernetesSidecar is the default value for EnableKubernetesSidecar
109109
DefaultEnableKubernetesSidecar = false
110+
// EnableWaitExponentialBackoff is the flag to enable exponential backoff strategy
111+
EnableWaitExponentialBackoff = "enable-wait-exponential-backoff"
112+
// DefaultEnableWaitExponentialBackoff is the default value for EnableWaitExponentialBackoff
113+
DefaultEnableWaitExponentialBackoff = false
114+
110115
// EnableStepActions is the flag to enable step actions (no-op since it's stable)
111116
EnableStepActions = "enable-step-actions"
112117

@@ -198,12 +203,13 @@ type FeatureFlags struct {
198203
Coschedule string `json:"coschedule,omitempty"`
199204
EnableCELInWhenExpression bool `json:"enableCELInWhenExpression,omitempty"`
200205
// EnableStepActions is a no-op flag since StepActions are stable
201-
EnableStepActions bool `json:"enableStepActions,omitempty"`
202-
EnableParamEnum bool `json:"enableParamEnum,omitempty"`
203-
EnableArtifacts bool `json:"enableArtifacts,omitempty"`
204-
DisableInlineSpec string `json:"disableInlineSpec,omitempty"`
205-
EnableConciseResolverSyntax bool `json:"enableConciseResolverSyntax,omitempty"`
206-
EnableKubernetesSidecar bool `json:"enableKubernetesSidecar,omitempty"`
206+
EnableStepActions bool `json:"enableStepActions,omitempty"`
207+
EnableParamEnum bool `json:"enableParamEnum,omitempty"`
208+
EnableArtifacts bool `json:"enableArtifacts,omitempty"`
209+
DisableInlineSpec string `json:"disableInlineSpec,omitempty"`
210+
EnableConciseResolverSyntax bool `json:"enableConciseResolverSyntax,omitempty"`
211+
EnableKubernetesSidecar bool `json:"enableKubernetesSidecar,omitempty"`
212+
EnableWaitExponentialBackoff bool `json:"enableWaitExponentialBackoff,omitempty"`
207213
}
208214

209215
// GetFeatureFlagsConfigName returns the name of the configmap containing all
@@ -308,6 +314,9 @@ func NewFeatureFlagsFromMap(cfgMap map[string]string) (*FeatureFlags, error) {
308314
if err := setFeature(EnableKubernetesSidecar, DefaultEnableKubernetesSidecar, &tc.EnableKubernetesSidecar); err != nil {
309315
return nil, err
310316
}
317+
if err := setFeature(EnableWaitExponentialBackoff, DefaultEnableWaitExponentialBackoff, &tc.EnableWaitExponentialBackoff); err != nil {
318+
return nil, err
319+
}
311320

312321
return &tc, nil
313322
}

pkg/apis/config/store.go

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ type cfgKey struct{}
3030
// Config holds the collection of configurations that we attach to contexts.
3131
// +k8s:deepcopy-gen=false
3232
type Config struct {
33-
Defaults *Defaults
34-
FeatureFlags *FeatureFlags
35-
Metrics *Metrics
36-
SpireConfig *sc.SpireConfig
37-
Events *Events
38-
Tracing *Tracing
33+
Defaults *Defaults
34+
FeatureFlags *FeatureFlags
35+
Metrics *Metrics
36+
SpireConfig *sc.SpireConfig
37+
Events *Events
38+
Tracing *Tracing
39+
WaitExponentialBackoff *WaitExponentialBackoff
3940
}
4041

4142
// FromContext extracts a Config from the provided context.
@@ -55,12 +56,13 @@ func FromContextOrDefaults(ctx context.Context) *Config {
5556
}
5657

5758
return &Config{
58-
Defaults: DefaultConfig.DeepCopy(),
59-
FeatureFlags: DefaultFeatureFlags.DeepCopy(),
60-
Metrics: DefaultMetrics.DeepCopy(),
61-
SpireConfig: DefaultSpire.DeepCopy(),
62-
Events: DefaultEvents.DeepCopy(),
63-
Tracing: DefaultTracing.DeepCopy(),
59+
Defaults: DefaultConfig.DeepCopy(),
60+
FeatureFlags: DefaultFeatureFlags.DeepCopy(),
61+
Metrics: DefaultMetrics.DeepCopy(),
62+
SpireConfig: DefaultSpire.DeepCopy(),
63+
Events: DefaultEvents.DeepCopy(),
64+
Tracing: DefaultTracing.DeepCopy(),
65+
WaitExponentialBackoff: DefaultWaitExponentialBackoff.DeepCopy(),
6466
}
6567
}
6668

@@ -83,12 +85,13 @@ func NewStore(logger configmap.Logger, onAfterStore ...func(name string, value i
8385
"defaults/features/artifacts",
8486
logger,
8587
configmap.Constructors{
86-
GetDefaultsConfigName(): NewDefaultsFromConfigMap,
87-
GetFeatureFlagsConfigName(): NewFeatureFlagsFromConfigMap,
88-
GetMetricsConfigName(): NewMetricsFromConfigMap,
89-
GetSpireConfigName(): NewSpireConfigFromConfigMap,
90-
GetEventsConfigName(): NewEventsFromConfigMap,
91-
GetTracingConfigName(): NewTracingFromConfigMap,
88+
GetDefaultsConfigName(): NewDefaultsFromConfigMap,
89+
GetFeatureFlagsConfigName(): NewFeatureFlagsFromConfigMap,
90+
GetMetricsConfigName(): NewMetricsFromConfigMap,
91+
GetSpireConfigName(): NewSpireConfigFromConfigMap,
92+
GetEventsConfigName(): NewEventsFromConfigMap,
93+
GetTracingConfigName(): NewTracingFromConfigMap,
94+
GetWaitExponentialBackoffConfigName(): NewWaitExponentialBackoffFromConfigMap,
9295
},
9396
onAfterStore...,
9497
),
@@ -129,13 +132,18 @@ func (s *Store) Load() *Config {
129132
if events == nil {
130133
events = DefaultEvents.DeepCopy()
131134
}
135+
waitExponentialBackoff := s.UntypedLoad(GetWaitExponentialBackoffConfigName())
136+
if waitExponentialBackoff == nil {
137+
waitExponentialBackoff = DefaultWaitExponentialBackoff.DeepCopy()
138+
}
132139

133140
return &Config{
134-
Defaults: defaults.(*Defaults).DeepCopy(),
135-
FeatureFlags: featureFlags.(*FeatureFlags).DeepCopy(),
136-
Metrics: metrics.(*Metrics).DeepCopy(),
137-
Tracing: tracing.(*Tracing).DeepCopy(),
138-
SpireConfig: spireconfig.(*sc.SpireConfig).DeepCopy(),
139-
Events: events.(*Events).DeepCopy(),
141+
Defaults: defaults.(*Defaults).DeepCopy(),
142+
FeatureFlags: featureFlags.(*FeatureFlags).DeepCopy(),
143+
Metrics: metrics.(*Metrics).DeepCopy(),
144+
Tracing: tracing.(*Tracing).DeepCopy(),
145+
SpireConfig: spireconfig.(*sc.SpireConfig).DeepCopy(),
146+
Events: events.(*Events).DeepCopy(),
147+
WaitExponentialBackoff: waitExponentialBackoff.(*WaitExponentialBackoff).DeepCopy(),
140148
}
141149
}

pkg/apis/config/store_test.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,21 +33,24 @@ func TestStoreLoadWithContext(t *testing.T) {
3333
spireConfig := test.ConfigMapFromTestFile(t, "config-spire")
3434
eventsConfig := test.ConfigMapFromTestFile(t, "config-events")
3535
tracingConfig := test.ConfigMapFromTestFile(t, "config-tracing")
36+
waitExponentialBackoffConfig := test.ConfigMapFromTestFile(t, "config-wait-exponential-backoff")
3637

3738
expectedDefaults, _ := config.NewDefaultsFromConfigMap(defaultConfig)
3839
expectedFeatures, _ := config.NewFeatureFlagsFromConfigMap(featuresConfig)
3940
metrics, _ := config.NewMetricsFromConfigMap(metricsConfig)
4041
expectedSpireConfig, _ := config.NewSpireConfigFromConfigMap(spireConfig)
4142
expectedEventsConfig, _ := config.NewEventsFromConfigMap(eventsConfig)
4243
expectedTracingConfig, _ := config.NewTracingFromConfigMap(tracingConfig)
44+
expectedWaitExponentialBackoffConfig, _ := config.NewWaitExponentialBackoffFromConfigMap(waitExponentialBackoffConfig)
4345

4446
expected := &config.Config{
45-
Defaults: expectedDefaults,
46-
FeatureFlags: expectedFeatures,
47-
Metrics: metrics,
48-
SpireConfig: expectedSpireConfig,
49-
Events: expectedEventsConfig,
50-
Tracing: expectedTracingConfig,
47+
Defaults: expectedDefaults,
48+
FeatureFlags: expectedFeatures,
49+
Metrics: metrics,
50+
SpireConfig: expectedSpireConfig,
51+
Events: expectedEventsConfig,
52+
Tracing: expectedTracingConfig,
53+
WaitExponentialBackoff: expectedWaitExponentialBackoffConfig,
5154
}
5255

5356
store := config.NewStore(logtesting.TestLogger(t))
@@ -57,6 +60,7 @@ func TestStoreLoadWithContext(t *testing.T) {
5760
store.OnConfigChanged(spireConfig)
5861
store.OnConfigChanged(eventsConfig)
5962
store.OnConfigChanged(tracingConfig)
63+
store.OnConfigChanged(waitExponentialBackoffConfig)
6064

6165
cfg := config.FromContext(store.ToContext(t.Context()))
6266

@@ -67,12 +71,13 @@ func TestStoreLoadWithContext(t *testing.T) {
6771

6872
func TestStoreLoadWithContext_Empty(t *testing.T) {
6973
want := &config.Config{
70-
Defaults: config.DefaultConfig.DeepCopy(),
71-
FeatureFlags: config.DefaultFeatureFlags.DeepCopy(),
72-
Metrics: config.DefaultMetrics.DeepCopy(),
73-
SpireConfig: config.DefaultSpire.DeepCopy(),
74-
Events: config.DefaultEvents.DeepCopy(),
75-
Tracing: config.DefaultTracing.DeepCopy(),
74+
Defaults: config.DefaultConfig.DeepCopy(),
75+
FeatureFlags: config.DefaultFeatureFlags.DeepCopy(),
76+
Metrics: config.DefaultMetrics.DeepCopy(),
77+
SpireConfig: config.DefaultSpire.DeepCopy(),
78+
Events: config.DefaultEvents.DeepCopy(),
79+
Tracing: config.DefaultTracing.DeepCopy(),
80+
WaitExponentialBackoff: config.DefaultWaitExponentialBackoff.DeepCopy(),
7681
}
7782

7883
store := config.NewStore(logtesting.TestLogger(t))
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: config-wait-exponential-backoff-custom
5+
data:
6+
duration: "5s"
7+
factor: "3.5"
8+
jitter: "0.2"
9+
steps: "7"
10+
cap: "120s"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: config-wait-exponential-backoff-empty
5+
data: {}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: config-wait-exponential-backoff
5+
data:
6+
duration: "1s"
7+
factor: "2.0"
8+
jitter: "0.0"
9+
steps: "10"
10+
cap: "30s"

0 commit comments

Comments
 (0)