Skip to content

Commit bee7be0

Browse files
committed
Updatable timeout for envd init
1 parent ec8ed58 commit bee7be0

File tree

12 files changed

+64
-15
lines changed

12 files changed

+64
-15
lines changed

packages/orchestrator/internal/metrics/sandboxes.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -197,7 +197,7 @@ func (so *SandboxObserver) startObserving() (metric.Registration, error) {
197197
logger.WithSandboxID(sbx.Runtime.SandboxID),
198198
logger.WithTeamID(sbx.Runtime.TeamID),
199199
logger.WithTemplateID(sbx.Runtime.TemplateID),
200-
zap.String("envd_version", sbx.Config.Envd.Version),
200+
logger.WithEnvdVersion(sbx.Config.Envd.Version),
201201
zap.Time("sandbox_start", sbx.StartedAt),
202202
zap.Int64("clock_host", hostTm),
203203
zap.Int64("clock_sbx", sbxTm),

packages/orchestrator/internal/sandbox/envd.go

Lines changed: 23 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,23 +9,29 @@ import (
99
"net/http"
1010
"time"
1111

12+
"go.opentelemetry.io/otel/attribute"
13+
"go.opentelemetry.io/otel/metric"
14+
"go.opentelemetry.io/otel/trace"
15+
1216
"github.com/e2b-dev/infra/packages/shared/pkg/consts"
17+
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
1318
)
1419

1520
const (
16-
requestTimeout = 50 * time.Millisecond
17-
loopDelay = 5 * time.Millisecond
21+
loopDelay = 5 * time.Millisecond
1822
)
1923

2024
// doRequestWithInfiniteRetries does a request with infinite retries until the context is done.
2125
// The parent context should have a deadline or a timeout.
22-
func doRequestWithInfiniteRetries(ctx context.Context, method, address string, requestBody []byte, accessToken *string) (*http.Response, error) {
26+
func doRequestWithInfiniteRetries(ctx context.Context, method, address string, requestBody []byte, accessToken *string, envdInitRequestTimeout time.Duration, sandboxID string) (*http.Response, int64, error) {
27+
requestCount := int64(0)
2328
for {
24-
reqCtx, cancel := context.WithTimeout(ctx, requestTimeout)
29+
requestCount++
30+
reqCtx, cancel := context.WithTimeout(ctx, envdInitRequestTimeout)
2531
request, err := http.NewRequestWithContext(reqCtx, method, address, bytes.NewReader(requestBody))
2632
if err != nil {
2733
cancel()
28-
return nil, err
34+
return nil, requestCount, err
2935
}
3036

3137
// make sure request to already authorized envd will not fail
@@ -38,12 +44,12 @@ func doRequestWithInfiniteRetries(ctx context.Context, method, address string, r
3844
cancel()
3945

4046
if err == nil {
41-
return response, nil
47+
return response, requestCount, nil
4248
}
4349

4450
select {
4551
case <-ctx.Done():
46-
return nil, fmt.Errorf("%w with cause: %w", ctx.Err(), context.Cause(ctx))
52+
return nil, requestCount, fmt.Errorf("%w with cause: %w", ctx.Err(), context.Cause(ctx))
4753
case <-time.After(loopDelay):
4854
}
4955
}
@@ -56,9 +62,11 @@ type PostInitJSONBody struct {
5662
Timestamp *time.Time `json:"timestamp,omitempty"`
5763
}
5864

59-
func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, accessToken *string) error {
60-
childCtx, childSpan := tracer.Start(ctx, "envd-init")
61-
defer childSpan.End()
65+
func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, accessToken *string, envdInitRequestTimeout time.Duration) error {
66+
ctx, span := tracer.Start(ctx, "envd-init", trace.WithAttributes(telemetry.WithEnvdVersion(s.Config.Envd.Version)))
67+
defer span.End()
68+
69+
attributes := metric.WithAttributes(telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", envdInitRequestTimeout.Milliseconds()))
6270

6371
hyperloopIP := s.Slot.HyperloopIPString()
6472
address := fmt.Sprintf("http://%s:%d/init", s.Slot.HostIPString(), consts.DefaultEnvdServerPort)
@@ -75,7 +83,8 @@ func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, acces
7583
return err
7684
}
7785

78-
response, err := doRequestWithInfiniteRetries(childCtx, "POST", address, body, accessToken)
86+
response, count, err := doRequestWithInfiniteRetries(ctx, "POST", address, body, accessToken, envdInitRequestTimeout, s.Runtime.SandboxID)
87+
envdInitAttempts.Add(ctx, count, attributes)
7988
if err != nil {
8089
return fmt.Errorf("failed to init envd: %w", err)
8190
}
@@ -90,5 +99,8 @@ func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, acces
9099
return err
91100
}
92101

102+
// Track successful envd init
103+
envdInitSuccess.Add(ctx, 1, attributes)
104+
93105
return nil
94106
}

packages/orchestrator/internal/sandbox/sandbox.go

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"time"
99

1010
"github.com/google/uuid"
11+
"go.opentelemetry.io/otel"
1112
"go.opentelemetry.io/otel/attribute"
1213
"go.opentelemetry.io/otel/codes"
1314
"go.opentelemetry.io/otel/trace"
@@ -32,7 +33,12 @@ import (
3233
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
3334
)
3435

35-
var defaultEnvdTimeout = utils.Must(time.ParseDuration(env.GetEnv("ENVD_TIMEOUT", "10s")))
36+
var (
37+
defaultEnvdTimeout = utils.Must(time.ParseDuration(env.GetEnv("ENVD_TIMEOUT", "10s")))
38+
meter = otel.GetMeterProvider().Meter("github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox")
39+
envdInitAttempts = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdInitAttempts))
40+
envdInitSuccess = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdInitSuccess))
41+
)
3642

3743
var httpClient = http.Client{
3844
Timeout: 10 * time.Second,
@@ -317,6 +323,7 @@ func ResumeSandbox(
317323
devicePool *nbd.DevicePool,
318324
useClickhouseMetrics bool,
319325
apiConfigToStore *orchestrator.SandboxConfig,
326+
envdInitRequestTimeout time.Duration,
320327
) (s *Sandbox, e error) {
321328
ctx, span := tracer.Start(ctx, "resume sandbox")
322329
defer func() { endSpan(span, e) }()
@@ -520,6 +527,7 @@ func ResumeSandbox(
520527
err = sbx.WaitForEnvd(
521528
ctx,
522529
defaultEnvdTimeout,
530+
envdInitRequestTimeout,
523531
)
524532
if err != nil {
525533
return nil, fmt.Errorf("failed to wait for sandbox start: %w", err)
@@ -931,6 +939,7 @@ func (s *Sandbox) WaitForExit(ctx context.Context) error {
931939
func (s *Sandbox) WaitForEnvd(
932940
ctx context.Context,
933941
timeout time.Duration,
942+
envdInitRequestTimeout time.Duration,
934943
) (e error) {
935944
ctx, span := tracer.Start(ctx, "sandbox-wait-for-start")
936945
defer span.End()
@@ -959,7 +968,7 @@ func (s *Sandbox) WaitForEnvd(
959968
}
960969
}()
961970

962-
if err := s.initEnvd(ctx, s.Config.Envd.Vars, s.Config.Envd.AccessToken); err != nil {
971+
if err := s.initEnvd(ctx, s.Config.Envd.Vars, s.Config.Envd.AccessToken, envdInitRequestTimeout); err != nil {
963972
return fmt.Errorf("failed to init new envd: %w", err)
964973
}
965974

packages/orchestrator/internal/server/sandboxes.go

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,13 @@ func (s *server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
104104
return nil, fmt.Errorf("failed to get template snapshot data: %w", err)
105105
}
106106

107+
// Get the per-request envd init timeout (in milliseconds) from the feature flag
108+
envdInitRequestTimeoutMs, err := s.featureFlags.IntFlag(ctx, featureflags.EnvdInitTimeoutSeconds)
109+
if err != nil {
110+
zap.L().Warn("failed to get envd timeout from feature flag, using default", zap.Error(err))
111+
}
112+
envdInitRequestTimeout := time.Duration(envdInitRequestTimeoutMs) * time.Millisecond
113+
107114
sbx, err := sandbox.ResumeSandbox(
108115
ctx,
109116
s.networkPool,
@@ -136,6 +143,7 @@ func (s *server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
136143
s.devicePool,
137144
metricsWriteFlag,
138145
req.Sandbox,
146+
envdInitRequestTimeout,
139147
)
140148
if err != nil {
141149
err := errors.Join(err, context.Cause(ctx))

packages/orchestrator/internal/template/build/layer/create_sandbox.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@ func (cs *CreateSandbox) Sandbox(
100100
err = sbx.WaitForEnvd(
101101
ctx,
102102
waitEnvdTimeout,
103+
defaultEnvdInitRequestTimeout,
103104
)
104105
if err != nil {
105106
return nil, fmt.Errorf("wait for envd: %w", err)

packages/orchestrator/internal/template/build/layer/interfaces.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,8 @@ import (
1010
)
1111

1212
const (
13-
waitEnvdTimeout = 60 * time.Second
13+
defaultEnvdInitRequestTimeout = 50 * time.Millisecond
14+
waitEnvdTimeout = 60 * time.Second
1415
)
1516

1617
// SandboxCreator creates sandboxes for layer building

packages/orchestrator/internal/template/build/layer/layer_executor.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,7 @@ func (lb *LayerExecutor) updateEnvdInSandbox(
193193
err = sbx.WaitForEnvd(
194194
ctx,
195195
waitEnvdTimeout,
196+
defaultEnvdInitRequestTimeout,
196197
)
197198
if err != nil {
198199
return fmt.Errorf("failed to wait for envd initialization after update: %w", err)

packages/orchestrator/internal/template/build/layer/resume_sandbox.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ func (rs *ResumeSandbox) Sandbox(
4646
layerExecutor.devicePool,
4747
false,
4848
nil,
49+
defaultEnvdInitRequestTimeout,
4950
)
5051
if err != nil {
5152
return nil, fmt.Errorf("resume sandbox: %w", err)

packages/shared/pkg/feature-flags/flags.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,4 +84,5 @@ var (
8484
BestOfKMaxOvercommit = newIntFlag("best-of-k-max-overcommit", 400) // Default R=4 (stored as percentage, max over-commit ratio)
8585
BestOfKAlpha = newIntFlag("best-of-k-alpha", 50) // Default Alpha=0.5 (stored as percentage for int flag, current usage weight)
8686
PubsubQueueChannelSize = newIntFlag("pubsub-queue-channel-size", 8*1024) // size of the channel buffer used to queue incoming sandbox events
87+
EnvdInitTimeoutSeconds = newIntFlag("envd-init-request-timeout-milliseconds", 50) // Timeout for envd init request in milliseconds
8788
)

packages/shared/pkg/logger/fields.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,7 @@ func WithClusterID(clusterID uuid.UUID) zap.Field {
3232
func WithServiceInstanceID(instanceID string) zap.Field {
3333
return zap.String("service.instance.id", instanceID)
3434
}
35+
36+
// WithEnvdVersion returns a zap string field that records envdVersion under the "envd.version" key.
func WithEnvdVersion(envdVersion string) zap.Field {
37+
return zap.String("envd.version", envdVersion)
38+
}

0 commit comments

Comments
 (0)