Skip to content

Commit be748c2

Browse files
authored
Dynamically customizable timeout for envd init (#1265)
1 parent 11cebd3 commit be748c2

File tree

10 files changed

+137
-50
lines changed

10 files changed

+137
-50
lines changed

packages/orchestrator/internal/metrics/sandboxes.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ func (so *SandboxObserver) startObserving() (metric.Registration, error) {
197197
logger.WithSandboxID(sbx.Runtime.SandboxID),
198198
logger.WithTeamID(sbx.Runtime.TeamID),
199199
logger.WithTemplateID(sbx.Runtime.TemplateID),
200-
zap.String("envd_version", sbx.Config.Envd.Version),
200+
logger.WithEnvdVersion(sbx.Config.Envd.Version),
201201
zap.Time("sandbox_start", sbx.StartedAt),
202202
zap.Int64("clock_host", hostTm),
203203
zap.Int64("clock_sbx", sbxTm),

packages/orchestrator/internal/sandbox/envd.go

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,31 @@ import (
99
"net/http"
1010
"time"
1111

12+
"go.opentelemetry.io/otel/attribute"
13+
"go.opentelemetry.io/otel/metric"
14+
"go.opentelemetry.io/otel/trace"
15+
"go.uber.org/zap"
16+
1217
"github.com/e2b-dev/infra/packages/shared/pkg/consts"
18+
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
19+
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
1320
)
1421

1522
const (
16-
requestTimeout = 50 * time.Millisecond
17-
loopDelay = 5 * time.Millisecond
23+
loopDelay = 5 * time.Millisecond
1824
)
1925

2026
// doRequestWithInfiniteRetries does a request with infinite retries until the context is done.
2127
// The parent context should have a deadline or a timeout.
22-
func doRequestWithInfiniteRetries(ctx context.Context, method, address string, requestBody []byte, accessToken *string) (*http.Response, error) {
28+
func doRequestWithInfiniteRetries(ctx context.Context, method, address string, requestBody []byte, accessToken *string, envdInitRequestTimeout time.Duration, sandboxID, envdVersion string) (*http.Response, int64, error) {
29+
requestCount := int64(0)
2330
for {
24-
reqCtx, cancel := context.WithTimeout(ctx, requestTimeout)
31+
requestCount++
32+
reqCtx, cancel := context.WithTimeout(ctx, envdInitRequestTimeout)
2533
request, err := http.NewRequestWithContext(reqCtx, method, address, bytes.NewReader(requestBody))
2634
if err != nil {
2735
cancel()
28-
return nil, err
36+
return nil, requestCount, err
2937
}
3038

3139
// make sure request to already authorized envd will not fail
@@ -38,12 +46,14 @@ func doRequestWithInfiniteRetries(ctx context.Context, method, address string, r
3846
cancel()
3947

4048
if err == nil {
41-
return response, nil
49+
return response, requestCount, nil
4250
}
4351

52+
zap.L().Warn("failed to do request to envd, retrying", logger.WithSandboxID(sandboxID), logger.WithEnvdVersion(envdVersion), zap.Int64("timeout_ms", envdInitRequestTimeout.Milliseconds()), zap.Error(err))
53+
4454
select {
4555
case <-ctx.Done():
46-
return nil, fmt.Errorf("%w with cause: %w", ctx.Err(), context.Cause(ctx))
56+
return nil, requestCount, fmt.Errorf("%w with cause: %w", ctx.Err(), context.Cause(ctx))
4757
case <-time.After(loopDelay):
4858
}
4959
}
@@ -56,17 +66,21 @@ type PostInitJSONBody struct {
5666
Timestamp *time.Time `json:"timestamp,omitempty"`
5767
}
5868

59-
func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, accessToken *string) error {
60-
childCtx, childSpan := tracer.Start(ctx, "envd-init")
61-
defer childSpan.End()
69+
func (s *Sandbox) initEnvd(ctx context.Context) error {
70+
ctx, span := tracer.Start(ctx, "envd-init", trace.WithAttributes(telemetry.WithEnvdVersion(s.Config.Envd.Version)))
71+
defer span.End()
72+
73+
attributes := []attribute.KeyValue{telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds())}
74+
attributesFail := append(attributes, attribute.Bool("success", false))
75+
attributesSuccess := append(attributes, attribute.Bool("success", true))
6276

6377
hyperloopIP := s.Slot.HyperloopIPString()
6478
address := fmt.Sprintf("http://%s:%d/init", s.Slot.HostIPString(), consts.DefaultEnvdServerPort)
6579
now := time.Now()
6680
jsonBody := &PostInitJSONBody{
67-
EnvVars: &envVars,
81+
EnvVars: &s.Config.Envd.Vars,
6882
HyperloopIP: &hyperloopIP,
69-
AccessToken: accessToken,
83+
AccessToken: s.Config.Envd.AccessToken,
7084
Timestamp: &now,
7185
}
7286

@@ -75,11 +89,20 @@ func (s *Sandbox) initEnvd(ctx context.Context, envVars map[string]string, acces
7589
return err
7690
}
7791

78-
response, err := doRequestWithInfiniteRetries(childCtx, "POST", address, body, accessToken)
92+
response, count, err := doRequestWithInfiniteRetries(ctx, "POST", address, body, s.Config.Envd.AccessToken, s.internalConfig.EnvdInitRequestTimeout, s.Runtime.SandboxID, s.Config.Envd.Version)
7993
if err != nil {
94+
envdInitCalls.Add(ctx, count, metric.WithAttributes(attributesFail...))
8095
return fmt.Errorf("failed to init envd: %w", err)
8196
}
8297

98+
if count > 1 {
99+
// Track failed envd init calls
100+
envdInitCalls.Add(ctx, count-1, metric.WithAttributes(attributesFail...))
101+
}
102+
103+
// Track successful envd init
104+
envdInitCalls.Add(ctx, 1, metric.WithAttributes(attributesSuccess...))
105+
83106
defer response.Body.Close()
84107
if response.StatusCode != http.StatusNoContent {
85108
return fmt.Errorf("unexpected status code: %d", response.StatusCode)

packages/orchestrator/internal/sandbox/sandbox.go

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@ import (
88
"time"
99

1010
"github.com/google/uuid"
11+
"go.opentelemetry.io/otel"
1112
"go.opentelemetry.io/otel/attribute"
1213
"go.opentelemetry.io/otel/codes"
14+
"go.opentelemetry.io/otel/metric"
1315
"go.opentelemetry.io/otel/trace"
1416
"go.uber.org/zap"
1517

@@ -32,7 +34,12 @@ import (
3234
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
3335
)
3436

35-
var defaultEnvdTimeout = utils.Must(time.ParseDuration(env.GetEnv("ENVD_TIMEOUT", "10s")))
37+
var (
38+
defaultEnvdTimeout = utils.Must(time.ParseDuration(env.GetEnv("ENVD_TIMEOUT", "10s")))
39+
meter = otel.GetMeterProvider().Meter("orchestrator.internal.sandbox")
40+
envdInitCalls = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdInitCalls))
41+
waitForEnvdDurationHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.WaitForEnvdDurationHistogramName))
42+
)
3643

3744
var httpClient = http.Client{
3845
Timeout: 10 * time.Second,
@@ -76,9 +83,14 @@ type Resources struct {
7683
memory uffd.MemoryBackend
7784
}
7885

86+
type internalConfig struct {
87+
EnvdInitRequestTimeout time.Duration
88+
}
89+
7990
type Metadata struct {
80-
Config Config
81-
Runtime RuntimeMetadata
91+
internalConfig internalConfig
92+
Config Config
93+
Runtime RuntimeMetadata
8294

8395
StartedAt time.Time
8496
EndAt time.Time
@@ -272,6 +284,10 @@ func (f *Factory) CreateSandbox(
272284
}
273285

274286
metadata := &Metadata{
287+
internalConfig: internalConfig{
288+
EnvdInitRequestTimeout: f.GetEnvdInitRequestTimeout(ctx),
289+
},
290+
275291
Config: config,
276292
Runtime: runtime,
277293

@@ -506,6 +522,10 @@ func (f *Factory) ResumeSandbox(
506522
}
507523

508524
metadata := &Metadata{
525+
internalConfig: internalConfig{
526+
EnvdInitRequestTimeout: f.GetEnvdInitRequestTimeout(ctx),
527+
},
528+
509529
Config: config,
510530
Runtime: runtime,
511531

@@ -957,13 +977,19 @@ func (s *Sandbox) WaitForEnvd(
957977
ctx context.Context,
958978
timeout time.Duration,
959979
) (e error) {
980+
start := time.Now()
960981
ctx, span := tracer.Start(ctx, "sandbox-wait-for-start")
961982
defer span.End()
962983

963984
defer func() {
964985
if e != nil {
965986
return
966987
}
988+
duration := time.Since(start).Milliseconds()
989+
waitForEnvdDurationHistogram.Record(ctx, duration, metric.WithAttributes(
990+
telemetry.WithEnvdVersion(s.Config.Envd.Version),
991+
attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()),
992+
))
967993
// Update the sandbox as started now
968994
s.Metadata.StartedAt = time.Now()
969995
}()
@@ -984,11 +1010,19 @@ func (s *Sandbox) WaitForEnvd(
9841010
}
9851011
}()
9861012

987-
if err := s.initEnvd(ctx, s.Config.Envd.Vars, s.Config.Envd.AccessToken); err != nil {
1013+
if err := s.initEnvd(ctx); err != nil {
9881014
return fmt.Errorf("failed to init new envd: %w", err)
9891015
}
9901016

9911017
telemetry.ReportEvent(ctx, fmt.Sprintf("[sandbox %s]: initialized new envd", s.Metadata.Runtime.SandboxID))
9921018

9931019
return nil
9941020
}
1021+
1022+
func (f *Factory) GetEnvdInitRequestTimeout(ctx context.Context) time.Duration {
1023+
envdInitRequestTimeoutMs, err := f.featureFlags.IntFlag(ctx, featureflags.EnvdInitTimeoutSeconds)
1024+
if err != nil {
1025+
zap.L().Warn("failed to get envd timeout from feature flag, using default", zap.Error(err))
1026+
}
1027+
return time.Duration(envdInitRequestTimeoutMs) * time.Millisecond
1028+
}

packages/orchestrator/internal/template/build/phases/base/builder.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -290,14 +290,18 @@ func (bb *BaseBuilder) buildLayerFromOCI(
290290

291291
templateProvider := layer.NewDirectSourceTemplateProvider(localTemplate)
292292

293-
baseLayer, err := bb.layerExecutor.BuildLayer(ctx, userLogger, layer.LayerBuildCommand{
294-
SourceTemplate: templateProvider,
295-
CurrentLayer: baseMetadata,
296-
Hash: hash,
297-
UpdateEnvd: false,
298-
SandboxCreator: sandboxCreator,
299-
ActionExecutor: actionExecutor,
300-
})
293+
baseLayer, err := bb.layerExecutor.BuildLayer(
294+
ctx,
295+
userLogger,
296+
layer.LayerBuildCommand{
297+
SourceTemplate: templateProvider,
298+
CurrentLayer: baseMetadata,
299+
Hash: hash,
300+
UpdateEnvd: false,
301+
SandboxCreator: sandboxCreator,
302+
ActionExecutor: actionExecutor,
303+
},
304+
)
301305
if err != nil {
302306
return metadata.Template{}, fmt.Errorf("error building base layer: %w", err)
303307
}

packages/orchestrator/internal/template/build/phases/finalize/builder.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,18 @@ func (ppb *PostProcessingBuilder) Build(
142142

143143
templateProvider := layer.NewCacheSourceTemplateProvider(sourceLayer.Metadata.Template)
144144

145-
finalLayer, err := ppb.layerExecutor.BuildLayer(ctx, userLogger, layer.LayerBuildCommand{
146-
SourceTemplate: templateProvider,
147-
CurrentLayer: currentLayer.Metadata,
148-
Hash: currentLayer.Hash,
149-
UpdateEnvd: sourceLayer.Cached,
150-
SandboxCreator: sandboxCreator,
151-
ActionExecutor: actionExecutor,
152-
})
145+
finalLayer, err := ppb.layerExecutor.BuildLayer(
146+
ctx,
147+
userLogger,
148+
layer.LayerBuildCommand{
149+
SourceTemplate: templateProvider,
150+
CurrentLayer: currentLayer.Metadata,
151+
Hash: currentLayer.Hash,
152+
UpdateEnvd: sourceLayer.Cached,
153+
SandboxCreator: sandboxCreator,
154+
ActionExecutor: actionExecutor,
155+
},
156+
)
153157
if err != nil {
154158
return phases.LayerResult{}, fmt.Errorf("error running start and ready commands in sandbox: %w", err)
155159
}

packages/orchestrator/internal/template/build/phases/steps/builder.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -208,14 +208,18 @@ func (sb *StepBuilder) Build(
208208

209209
templateProvider := layer.NewCacheSourceTemplateProvider(sourceLayer.Metadata.Template)
210210

211-
meta, err := sb.layerExecutor.BuildLayer(ctx, userLogger, layer.LayerBuildCommand{
212-
SourceTemplate: templateProvider,
213-
CurrentLayer: currentLayer.Metadata,
214-
Hash: currentLayer.Hash,
215-
UpdateEnvd: sourceLayer.Cached,
216-
SandboxCreator: sandboxCreator,
217-
ActionExecutor: actionExecutor,
218-
})
211+
meta, err := sb.layerExecutor.BuildLayer(
212+
ctx,
213+
userLogger,
214+
layer.LayerBuildCommand{
215+
SourceTemplate: templateProvider,
216+
CurrentLayer: currentLayer.Metadata,
217+
Hash: currentLayer.Hash,
218+
UpdateEnvd: sourceLayer.Cached,
219+
SandboxCreator: sandboxCreator,
220+
ActionExecutor: actionExecutor,
221+
},
222+
)
219223
if err != nil {
220224
return phases.LayerResult{}, fmt.Errorf("error building step %d: %w", sb.stepNumber, err)
221225
}

packages/shared/pkg/feature-flags/flags.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,5 @@ var (
8484
BestOfKMaxOvercommit = newIntFlag("best-of-k-max-overcommit", 400) // Default R=4 (stored as percentage, max over-commit ratio)
8585
BestOfKAlpha = newIntFlag("best-of-k-alpha", 50) // Default Alpha=0.5 (stored as percentage for int flag, current usage weight)
8686
PubsubQueueChannelSize = newIntFlag("pubsub-queue-channel-size", 8*1024) // size of the channel buffer used to queue incoming sandbox events
87+
EnvdInitTimeoutSeconds = newIntFlag("envd-init-request-timeout-milliseconds", 50) // Timeout for envd init request in milliseconds
8788
)

packages/shared/pkg/logger/fields.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,7 @@ func WithClusterID(clusterID uuid.UUID) zap.Field {
3232
func WithServiceInstanceID(instanceID string) zap.Field {
3333
return zap.String("service.instance.id", instanceID)
3434
}
35+
36+
func WithEnvdVersion(envdVersion string) zap.Field {
37+
return zap.String("envd.version", envdVersion)
38+
}

packages/shared/pkg/telemetry/fields.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ func WithTeamID(teamID string) attribute.KeyValue {
3636
return zapFieldToOTELAttribute(logger.WithTeamID(teamID))
3737
}
3838

39+
func WithEnvdVersion(envdVersion string) attribute.KeyValue {
40+
return zapFieldToOTELAttribute(logger.WithEnvdVersion(envdVersion))
41+
}
42+
3943
func zapFieldToOTELAttribute(f zap.Field) attribute.KeyValue {
4044
e := &ZapFieldToOTELAttributeEncoder{}
4145
f.AddTo(e)

packages/shared/pkg/telemetry/meters.go

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ const (
2424
SandboxCreateMeterName CounterType = "api.env.instance.started"
2525

2626
TeamSandboxCreated CounterType = "e2b.team.sandbox.created"
27+
28+
EnvdInitCalls CounterType = "orchestrator.sandbox.envd.init.calls"
2729
)
2830

2931
const (
@@ -61,6 +63,9 @@ const (
6163
BuildDurationHistogramName HistogramType = "template.build.duration"
6264
BuildPhaseDurationHistogramName HistogramType = "template.build.phase.duration"
6365
BuildStepDurationHistogramName HistogramType = "template.build.step.duration"
66+
67+
// Sandbox timing histograms
68+
WaitForEnvdDurationHistogramName HistogramType = "orchestrator.sandbox.envd.init.duration"
6469
)
6570

6671
const (
@@ -92,6 +97,7 @@ var counterDesc = map[CounterType]string{
9297
BuildResultCounterName: "Number of template build results",
9398
BuildCacheResultCounterName: "Number of build cache results",
9499
TeamSandboxCreated: "Counter of started sandboxes for the team in the interval",
100+
EnvdInitCalls: "Number of envd initialization calls",
95101
}
96102

97103
var counterUnits = map[CounterType]string{
@@ -100,6 +106,7 @@ var counterUnits = map[CounterType]string{
100106
BuildResultCounterName: "{build}",
101107
BuildCacheResultCounterName: "{layer}",
102108
TeamSandboxCreated: "{sandbox}",
109+
EnvdInitCalls: "1",
103110
}
104111

105112
var observableCounterDesc = map[ObservableCounterType]string{
@@ -233,17 +240,19 @@ func GetGaugeInt(meter metric.Meter, name GaugeIntType) (metric.Int64ObservableG
233240
}
234241

235242
var histogramDesc = map[HistogramType]string{
236-
BuildDurationHistogramName: "Time taken to build a template",
237-
BuildPhaseDurationHistogramName: "Time taken to build each phase of a template",
238-
BuildStepDurationHistogramName: "Time taken to build each step of a template",
239-
BuildRootfsSizeHistogramName: "Size of the built template rootfs in bytes",
243+
BuildDurationHistogramName: "Time taken to build a template",
244+
BuildPhaseDurationHistogramName: "Time taken to build each phase of a template",
245+
BuildStepDurationHistogramName: "Time taken to build each step of a template",
246+
BuildRootfsSizeHistogramName: "Size of the built template rootfs in bytes",
247+
WaitForEnvdDurationHistogramName: "Time taken for Envd to initialize successfully",
240248
}
241249

242250
var histogramUnits = map[HistogramType]string{
243-
BuildDurationHistogramName: "ms",
244-
BuildPhaseDurationHistogramName: "ms",
245-
BuildStepDurationHistogramName: "ms",
246-
BuildRootfsSizeHistogramName: "{By}",
251+
BuildDurationHistogramName: "ms",
252+
BuildPhaseDurationHistogramName: "ms",
253+
BuildStepDurationHistogramName: "ms",
254+
BuildRootfsSizeHistogramName: "{By}",
255+
WaitForEnvdDurationHistogramName: "ms",
247256
}
248257

249258
func GetHistogram(meter metric.Meter, name HistogramType) (metric.Int64Histogram, error) {

0 commit comments

Comments
 (0)