Skip to content

Commit be77084

Browse files
Enrollment should not be infinite when triggered from action (#10946)
1 parent 4a3182e commit be77084

File tree

6 files changed

+115
-6
lines changed

6 files changed

+115
-6
lines changed

internal/pkg/agent/application/coordinator/coordinator.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,7 @@ func (c *Coordinator) Migrate(
682682
options,
683683
store,
684684
backoffFactory,
685+
3, // try enrollment for 3 times and fail
685686
)
686687
if err != nil {
687688
restoreErr := RestoreConfig()

internal/pkg/agent/application/enroll/enroll.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,9 @@ import (
3535
)
3636

3737
const (
38-
EnrollBackoffInit = time.Second * 5
39-
EnrollBackoffMax = time.Minute * 10
38+
EnrollBackoffInit = time.Second * 5
39+
EnrollBackoffMax = time.Minute * 10
40+
EnrollInfiniteAttempts = -1
4041

4142
maxRetriesstoreAgentInfo = 5
4243
defaultFleetServerHost = "0.0.0.0"
@@ -57,6 +58,7 @@ func EnrollWithBackoff(
5758
options EnrollOptions,
5859
configStore saver,
5960
backoffFactory func(done <-chan struct{}) backoff.Backoff,
61+
maxAttempts int,
6062
) error {
6163
if backoffFactory == nil {
6264
backoffFactory = func(done <-chan struct{}) backoff.Backoff {
@@ -92,26 +94,37 @@ func EnrollWithBackoff(
9294
signal := make(chan struct{})
9395
defer close(signal)
9496
backExp := backoffFactory(signal)
97+
enrollFn := func() error {
98+
return enroll(ctx, log, persistentConfig, client, options, configStore)
99+
}
100+
err = retryEnroll(err, maxAttempts, log, enrollFn, client.URI(), backExp)
101+
102+
return err
103+
}
104+
105+
func retryEnroll(err error, maxAttempts int, log *logger.Logger, enrollFn func() error, clientURI string, backExp backoff.Backoff) error {
106+
attemptNo := 1
95107

96108
RETRYLOOP:
97109
for {
110+
attemptNo++
98111
switch {
99112
case errors.Is(err, fleetapi.ErrTooManyRequests):
100113
log.Warn("Too many requests on the remote server, will retry in a moment.")
101114
case errors.Is(err, fleetapi.ErrConnRefused):
102115
log.Warn("Remote server is not ready to accept connections(Connection Refused), will retry in a moment.")
103116
case errors.Is(err, fleetapi.ErrTemporaryServerError):
104117
log.Warnf("Remote server failed to handle the request(%s), will retry in a moment.", err.Error())
105-
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), err == nil:
118+
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), errors.Is(err, fleetapi.ErrInvalidToken), err == nil, (maxAttempts != EnrollInfiniteAttempts && attemptNo > maxAttempts):
106119
break RETRYLOOP
107120
case err != nil:
108121
log.Warnf("Error detected: %s, will retry in a moment.", err.Error())
109122
}
110123
if !backExp.Wait() {
111124
break RETRYLOOP
112125
}
113-
log.Infof("Retrying enrollment to URL: %s", client.URI())
114-
err = enroll(ctx, log, persistentConfig, client, options, configStore)
126+
log.Infof("Retrying enrollment to URL: %s", clientURI)
127+
err = enrollFn()
115128
}
116129

117130
return err
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2+
// or more contributor license agreements. Licensed under the Elastic License 2.0;
3+
// you may not use this file except in compliance with the Elastic License 2.0.
4+
5+
package enroll
6+
7+
import (
8+
"context"
9+
"errors"
10+
"fmt"
11+
"testing"
12+
"time"
13+
14+
"github.com/stretchr/testify/require"
15+
16+
"github.com/elastic/elastic-agent/pkg/core/logger"
17+
)
18+
19+
// fakeBackoff allows controlling the sequence of Wait() return values for tests.
20+
type fakeBackoff struct {
21+
results []bool
22+
}
23+
24+
func (f *fakeBackoff) Wait() bool {
25+
return true
26+
}
27+
28+
func (f *fakeBackoff) NextWait() (d time.Duration) { return 0 }
29+
func (f *fakeBackoff) Reset() {}
30+
31+
func TestRetryEnroll_SucceedsAfterOneRetry(t *testing.T) {
32+
// initial error forces at least one retry
33+
initialErr := errors.New("initial failure")
34+
35+
called := 0
36+
enrollFn := func() error {
37+
called++
38+
// succeed on first retry
39+
return nil
40+
}
41+
42+
fb := &fakeBackoff{results: []bool{true}}
43+
44+
l := logger.NewWithoutConfig("")
45+
46+
err := retryEnroll(initialErr, 5, l, enrollFn, "http://localhost", fb)
47+
require.NoError(t, err)
48+
require.Equal(t, 1, called)
49+
}
50+
51+
func TestRetryEnroll_BackoffStopsImmediately(t *testing.T) {
52+
initialErr := fmt.Errorf("network")
53+
called := 0
54+
expectedAttempts := 5
55+
enrollFn := func() error {
56+
called++
57+
return errors.New("still failing")
58+
}
59+
60+
fb := &fakeBackoff{results: []bool{false}}
61+
62+
l := logger.NewWithoutConfig("")
63+
64+
err := retryEnroll(initialErr, expectedAttempts, l, enrollFn, "http://localhost", fb)
65+
require.Equal(t, expectedAttempts-1, called)
66+
require.Error(t, err) // error is expected
67+
require.NotErrorIs(t, err, initialErr) // subsequent failures are different
68+
}
69+
70+
func TestRetryEnroll_BreaksOnContextCanceled(t *testing.T) {
71+
// When err is context.Canceled, retryEnroll should return immediately
72+
cancelErr := context.Canceled
73+
called := 0
74+
enrollFn := func() error {
75+
called++
76+
return nil
77+
}
78+
fb := &fakeBackoff{results: []bool{true}}
79+
80+
l := logger.NewWithoutConfig("")
81+
82+
err := retryEnroll(cancelErr, 5, l, enrollFn, "http://localhost", fb)
83+
require.ErrorIs(t, err, context.Canceled)
84+
require.Equal(t, called, 0)
85+
}

internal/pkg/agent/cmd/enroll_cmd.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,9 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
176176
enrollDelay,
177177
*c.options,
178178
c.configStore,
179-
c.backoffFactory)
179+
c.backoffFactory,
180+
-1, // try indefinitely, let user cancel the action
181+
)
180182
if err != nil {
181183
return fmt.Errorf("fail to enroll: %w", err)
182184
}

internal/pkg/fleetapi/client/client.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ func ExtractError(resp io.Reader) error {
117117
if len(e.Message) == 0 {
118118
return fmt.Errorf("status code: %d, fleet-server returned an error: %s", e.StatusCode, e.Error)
119119
}
120+
120121
return fmt.Errorf(
121122
"status code: %d, fleet-server returned an error: %s, message: %s",
122123
e.StatusCode,

internal/pkg/fleetapi/enroll_cmd.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ var ErrConnRefused = errors.New("connection refused")
3333
// ErrTemporaryServerError is returned when the request caused a temporary server error
3434
var ErrTemporaryServerError = errors.New("temporary server error, please retry later")
3535

36+
// ErrInvalidToken is returned when client is not authorized to perform enrollment.
37+
var ErrInvalidToken error = errors.New("invalid enrollment token")
38+
3639
// temporaryServerErrorCodes defines status codes that allow clients to retry their request.
3740
var temporaryServerErrorCodes = map[int]string{
3841
http.StatusBadGateway: "BadGateway",
@@ -234,6 +237,10 @@ func (e *EnrollCmd) Execute(ctx context.Context, r *EnrollRequest) (*EnrollRespo
234237
return nil, ErrTooManyRequests
235238
}
236239

240+
if resp.StatusCode == http.StatusUnauthorized {
241+
return nil, ErrInvalidToken
242+
}
243+
237244
if status, temporary := temporaryServerErrorCodes[resp.StatusCode]; temporary {
238245
return nil, fmt.Errorf("received status code %d (%s): %w", resp.StatusCode, status, ErrTemporaryServerError)
239246
}

0 commit comments

Comments
 (0)