Skip to content

Commit 0438b7f

Browse files
committed
Updatable timeout for envd init
1 parent ff5ea68 commit 0438b7f

File tree

9 files changed

+57
-13
lines changed

9 files changed

+57
-13
lines changed

packages/orchestrator/internal/sandbox/envd.go

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,31 @@ import (
99
"net/http"
1010
"time"
1111

12+
"go.opentelemetry.io/otel/attribute"
13+
"go.opentelemetry.io/otel/metric"
14+
"go.opentelemetry.io/otel/trace"
15+
"go.uber.org/zap"
16+
1217
"github.com/e2b-dev/infra/packages/shared/pkg/consts"
18+
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
1319
)
1420

1521
const (
16-
requestTimeout = 50 * time.Millisecond
17-
loopDelay = 5 * time.Millisecond
22+
loopDelay = 5 * time.Millisecond
1823
)
1924

2025
// doRequestWithInfiniteRetries retries the request until it succeeds or the context is done.
2126
// The parent context should have a deadline or a timeout.
22-
func doRequestWithInfiniteRetries(ctx context.Context, method, address string, requestBody []byte, accessToken *string) (*http.Response, error) {
27+
func doRequestWithInfiniteRetries(ctx context.Context, method, address string, requestBody []byte, accessToken *string, envdInitRequestTimeout time.Duration, sandboxID string) (*http.Response, int64, error) {
28+
count := int64(0)
2329
for {
24-
reqCtx, cancel := context.WithTimeout(ctx, requestTimeout)
30+
count++
31+
start := time.Now()
32+
reqCtx, cancel := context.WithTimeout(ctx, envdInitRequestTimeout)
2533
request, err := http.NewRequestWithContext(reqCtx, method, address, bytes.NewReader(requestBody))
2634
if err != nil {
2735
cancel()
28-
return nil, err
36+
return nil, count, err
2937
}
3038

3139
// make sure request to already authorized envd will not fail
@@ -38,12 +46,13 @@ func doRequestWithInfiniteRetries(ctx context.Context, method, address string, r
3846
cancel()
3947

4048
if err == nil {
41-
return response, nil
49+
return response, count, nil
4250
}
4351

52+
zap.L().Error("Error requesting infinite retries", zap.Error(err), logger.WithSandboxID(sandboxID), zap.Int64("elapsed", time.Since(start).Milliseconds()))
4453
select {
4554
case <-ctx.Done():
46-
return nil, fmt.Errorf("%w with cause: %w", ctx.Err(), context.Cause(ctx))
55+
return nil, count, fmt.Errorf("%w with cause: %w", ctx.Err(), context.Cause(ctx))
4756
case <-time.After(loopDelay):
4857
}
4958
}
@@ -56,10 +65,12 @@ type PostInitJSONBody struct {
5665
Timestamp *time.Time `json:"timestamp,omitempty"`
5766
}
5867

59-
func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, accessToken *string) error {
60-
childCtx, childSpan := tracer.Start(ctx, "envd-init")
68+
func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, accessToken *string, envdInitRequestTimeout time.Duration) error {
69+
childCtx, childSpan := tracer.Start(ctx, "envd-init", trace.WithAttributes(attribute.String("evd_version", s.Config.Envd.Version)))
6170
defer childSpan.End()
6271

72+
attributes := metric.WithAttributes(attribute.String("envd_version", s.Config.Envd.Version), attribute.Int64("timeout_ms", envdInitRequestTimeout.Milliseconds()))
73+
6374
hyperloopIP := s.Slot.HyperloopIPString()
6475
address := fmt.Sprintf("http://%s:%d/init", s.Slot.HostIPString(), consts.DefaultEnvdServerPort)
6576
now := time.Now()
@@ -75,7 +86,8 @@ func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, acces
7586
return err
7687
}
7788

78-
response, err := doRequestWithInfiniteRetries(childCtx, "POST", address, body, accessToken)
89+
response, count, err := doRequestWithInfiniteRetries(childCtx, "POST", address, body, accessToken, envdInitRequestTimeout, s.Runtime.SandboxID)
90+
envdInitAttempts.Add(ctx, count, attributes)
7991
if err != nil {
8092
return fmt.Errorf("failed to init envd: %w", err)
8193
}
@@ -90,5 +102,8 @@ func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, acces
90102
return err
91103
}
92104

105+
// Track successful envd init
106+
envdInitSuccess.Add(ctx, 1, attributes)
107+
93108
return nil
94109
}

packages/orchestrator/internal/sandbox/sandbox.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"time"
99

1010
"github.com/google/uuid"
11+
"go.opentelemetry.io/otel"
1112
"go.opentelemetry.io/otel/attribute"
1213
"go.opentelemetry.io/otel/trace"
1314
"go.uber.org/zap"
@@ -31,7 +32,12 @@ import (
3132
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
3233
)
3334

34-
var defaultEnvdTimeout = utils.Must(time.ParseDuration(env.GetEnv("ENVD_TIMEOUT", "10s")))
35+
var (
36+
defaultEnvdTimeout = utils.Must(time.ParseDuration(env.GetEnv("ENVD_TIMEOUT", "10s")))
37+
meter = otel.GetMeterProvider().Meter("github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox")
38+
envdInitAttempts = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdInitAttempts))
39+
envdInitSuccess = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdInitSuccess))
40+
)
3541

3642
var httpClient = http.Client{
3743
Timeout: 10 * time.Second,
@@ -314,6 +320,7 @@ func ResumeSandbox(
314320
devicePool *nbd.DevicePool,
315321
useClickhouseMetrics bool,
316322
apiConfigToStore *orchestrator.SandboxConfig,
323+
envdInitRequestTimeout time.Duration,
317324
) (s *Sandbox, e error) {
318325
ctx, span, done := startSpan(ctx, "resume sandbox")
319326
defer func() { done(e) }()
@@ -522,6 +529,7 @@ func ResumeSandbox(
522529
err = sbx.WaitForEnvd(
523530
ctx,
524531
defaultEnvdTimeout,
532+
envdInitRequestTimeout,
525533
)
526534
if err != nil {
527535
return nil, fmt.Errorf("failed to wait for sandbox start: %w", err)
@@ -949,6 +957,7 @@ func (s *Sandbox) WaitForExit(ctx context.Context) error {
949957
func (s *Sandbox) WaitForEnvd(
950958
ctx context.Context,
951959
timeout time.Duration,
960+
envdInitRequestTimeout time.Duration,
952961
) (e error) {
953962
ctx, span := tracer.Start(ctx, "sandbox-wait-for-start")
954963
defer span.End()
@@ -977,7 +986,7 @@ func (s *Sandbox) WaitForEnvd(
977986
}
978987
}()
979988

980-
initErr := s.initEnvd(syncCtx, s.Config.Envd.Vars, s.Config.Envd.AccessToken)
989+
initErr := s.initEnvd(syncCtx, s.Config.Envd.Vars, s.Config.Envd.AccessToken, envdInitRequestTimeout)
981990
if initErr != nil {
982991
return fmt.Errorf("failed to init new envd: %w", initErr)
983992
}

packages/orchestrator/internal/server/sandboxes.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,13 @@ func (s *server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
104104
return nil, fmt.Errorf("failed to get template snapshot data: %w", err)
105105
}
106106

107+
// Get the per-request envd init timeout (in milliseconds) from the feature flag
108+
envdInitRequestTimeoutMs, err := s.featureFlags.IntFlag(ctx, featureflags.EnvdInitTimeoutSeconds)
109+
if err != nil {
110+
zap.L().Warn("failed to get envd timeout from feature flag, using default", zap.Error(err))
111+
}
112+
envdInitRequestTimeout := time.Duration(envdInitRequestTimeoutMs) * time.Millisecond
113+
107114
sbx, err := sandbox.ResumeSandbox(
108115
ctx,
109116
s.networkPool,
@@ -136,6 +143,7 @@ func (s *server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
136143
s.devicePool,
137144
metricsWriteFlag,
138145
req.Sandbox,
146+
envdInitRequestTimeout,
139147
)
140148
if err != nil {
141149
err := errors.Join(err, context.Cause(ctx))

packages/orchestrator/internal/template/build/layer/create_sandbox.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ func (cs *CreateSandbox) Sandbox(
100100
err = sbx.WaitForEnvd(
101101
ctx,
102102
waitEnvdTimeout,
103+
defaultEnvdInitRequestTimeout,
103104
)
104105
if err != nil {
105106
return nil, fmt.Errorf("wait for envd: %w", err)

packages/orchestrator/internal/template/build/layer/interfaces.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ import (
1010
)
1111

1212
const (
13-
waitEnvdTimeout = 60 * time.Second
13+
defaultEnvdInitRequestTimeout = 50 * time.Millisecond
14+
waitEnvdTimeout = 60 * time.Second
1415
)
1516

1617
// SandboxCreator creates sandboxes for layer building

packages/orchestrator/internal/template/build/layer/layer_executor.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ func (lb *LayerExecutor) updateEnvdInSandbox(
193193
err = sbx.WaitForEnvd(
194194
ctx,
195195
waitEnvdTimeout,
196+
defaultEnvdInitRequestTimeout,
196197
)
197198
if err != nil {
198199
return fmt.Errorf("failed to wait for envd initialization after update: %w", err)

packages/orchestrator/internal/template/build/layer/resume_sandbox.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ func (rs *ResumeSandbox) Sandbox(
4646
layerExecutor.devicePool,
4747
false,
4848
nil,
49+
defaultEnvdInitRequestTimeout,
4950
)
5051
if err != nil {
5152
return nil, fmt.Errorf("resume sandbox: %w", err)

packages/shared/pkg/feature-flags/flags.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,5 @@ var (
8484
BestOfKMaxOvercommit = newIntFlag("best-of-k-max-overcommit", 400) // Default R=4 (stored as percentage, max over-commit ratio)
8585
BestOfKAlpha = newIntFlag("best-of-k-alpha", 50) // Default Alpha=0.5 (stored as percentage for int flag, current usage weight)
8686
PubsubQueueChannelSize = newIntFlag("pubsub-queue-channel-size", 8*1024) // size of the channel buffer used to queue incoming sandbox events
87+
EnvdInitTimeoutSeconds = newIntFlag("envd-init-request-timeout-milliseconds", 50) // Timeout for envd init request in milliseconds
8788
)

packages/shared/pkg/telemetry/meters.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ const (
2424
SandboxCreateMeterName CounterType = "api.env.instance.started"
2525

2626
TeamSandboxCreated CounterType = "e2b.team.sandbox.created"
27+
28+
EnvdInitAttempts CounterType = "orchestrator.envd.init.attempts"
29+
EnvdInitSuccess CounterType = "orchestrator.envd.init.success"
2730
)
2831

2932
const (
@@ -92,6 +95,8 @@ var counterDesc = map[CounterType]string{
9295
BuildResultCounterName: "Number of template build results",
9396
BuildCacheResultCounterName: "Number of build cache results",
9497
TeamSandboxCreated: "Counter of started sandboxes for the team in the interval",
98+
EnvdInitAttempts: "Number of envd initialization attempts",
99+
EnvdInitSuccess: "Number of successful envd initializations",
95100
}
96101

97102
var counterUnits = map[CounterType]string{
@@ -100,6 +105,8 @@ var counterUnits = map[CounterType]string{
100105
BuildResultCounterName: "{build}",
101106
BuildCacheResultCounterName: "{layer}",
102107
TeamSandboxCreated: "{sandbox}",
108+
EnvdInitAttempts: "1",
109+
EnvdInitSuccess: "1",
103110
}
104111

105112
var observableCounterDesc = map[ObservableCounterType]string{

0 commit comments

Comments
 (0)