Skip to content

Commit 29bef8a

Browse files
committed
fix sandbox cleanup for template build
1 parent 8beee54 commit 29bef8a

File tree

10 files changed

+91
-67
lines changed

10 files changed

+91
-67
lines changed

packages/orchestrator/internal/sandbox/sandbox.go

Lines changed: 56 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,9 @@ type RuntimeMetadata struct {
7171
}
7272

7373
type Resources struct {
74-
Slot *network.Slot
75-
rootfs rootfs.Provider
76-
memory uffd.MemoryBackend
77-
uffdExit chan error
74+
Slot *network.Slot
75+
rootfs rootfs.Provider
76+
memory uffd.MemoryBackend
7877
}
7978

8079
type Metadata struct {
@@ -99,6 +98,8 @@ type Sandbox struct {
9998
Checks *Checks
10099

101100
APIStoredConfig *orchestrator.SandboxConfig
101+
102+
exit *utils.SetOnce[struct{}]
102103
}
103104

104105
func (s *Sandbox) LoggerMetadata() sbxlogger.SandboxMetadata {
@@ -131,6 +132,8 @@ func CreateSandbox(
131132
childCtx, childSpan := tracer.Start(ctx, "create-sandbox")
132133
defer childSpan.End()
133134

135+
exit := utils.NewSetOnce[struct{}]()
136+
134137
cleanup := NewCleanup()
135138
defer func() {
136139
if e != nil {
@@ -248,10 +251,9 @@ func CreateSandbox(
248251
telemetry.ReportEvent(childCtx, "created fc process")
249252

250253
resources := &Resources{
251-
Slot: ips.slot,
252-
rootfs: rootfsProvider,
253-
memory: uffd.NewNoopMemory(memfileSize, memfile.BlockSize()),
254-
uffdExit: make(chan error, 1),
254+
Slot: ips.slot,
255+
rootfs: rootfsProvider,
256+
memory: uffd.NewNoopMemory(memfileSize, memfile.BlockSize()),
255257
}
256258

257259
metadata := &Metadata{
@@ -273,6 +275,8 @@ func CreateSandbox(
273275
cleanup: cleanup,
274276

275277
APIStoredConfig: apiConfigToStore,
278+
279+
exit: exit,
276280
}
277281

278282
checks, err := NewChecks(ctx, tracer, sbx, false)
@@ -282,9 +286,18 @@ func CreateSandbox(
282286
sbx.Checks = checks
283287

284288
cleanup.AddPriority(func(ctx context.Context) error {
285-
return sbx.Close(ctx, tracer)
289+
// Stop the sandbox first if it is still running, otherwise do nothing
290+
return sbx.Stop(ctx, tracer)
286291
})
287292

293+
go func() {
294+
// If the process exists, stop the sandbox properly
295+
_, fcErr := fcHandle.Exit.Wait()
296+
err := s.Stop(context.WithoutCancel(ctx), tracer)
297+
298+
exit.SetResult(struct{}{}, errors.Join(err, fcErr))
299+
}()
300+
288301
return sbx, nil
289302
}
290303

@@ -307,6 +320,8 @@ func ResumeSandbox(
307320
childCtx, childSpan := tracer.Start(ctx, "resume-sandbox")
308321
defer childSpan.End()
309322

323+
exit := utils.NewSetOnce[struct{}]()
324+
310325
cleanup := NewCleanup()
311326
defer func() {
312327
if e != nil {
@@ -379,18 +394,16 @@ func ResumeSandbox(
379394
return nil, fmt.Errorf("failed to serve memory: %w", err)
380395
}
381396

397+
// ==== END of resources initialization ====
382398
uffdStartCtx, cancelUffdStartCtx := context.WithCancelCause(ctx)
383399
defer cancelUffdStartCtx(fmt.Errorf("uffd finished starting"))
384400

385-
uffdExit := make(chan error, 1)
386401
go func() {
387-
uffdWaitErr := <-fcUffd.Exit()
388-
uffdExit <- uffdWaitErr
402+
_, uffdWaitErr := fcUffd.Exit().Wait()
389403

390404
cancelUffdStartCtx(fmt.Errorf("uffd process exited: %w", errors.Join(uffdWaitErr, context.Cause(uffdStartCtx))))
391405
}()
392406

393-
// / ==== END of resources initialization ====
394407
rootfsPath, err := rootfsOverlay.Path()
395408
if err != nil {
396409
return nil, fmt.Errorf("failed to get rootfs path: %w", err)
@@ -452,10 +465,9 @@ func ResumeSandbox(
452465
telemetry.ReportEvent(childCtx, "initialized FC")
453466

454467
resources := &Resources{
455-
Slot: ips.slot,
456-
rootfs: rootfsOverlay,
457-
memory: fcUffd,
458-
uffdExit: uffdExit,
468+
Slot: ips.slot,
469+
rootfs: rootfsOverlay,
470+
memory: fcUffd,
459471
}
460472

461473
metadata := &Metadata{
@@ -477,6 +489,8 @@ func ResumeSandbox(
477489
cleanup: cleanup,
478490

479491
APIStoredConfig: apiConfigToStore,
492+
493+
exit: exit,
480494
}
481495

482496
// Part of the sandbox as we need to stop Checks before pausing the sandbox
@@ -489,7 +503,8 @@ func ResumeSandbox(
489503
sbx.Checks = checks
490504

491505
cleanup.AddPriority(func(ctx context.Context) error {
492-
return sbx.Close(ctx, tracer)
506+
// Stop the sandbox first if it is still running, otherwise do nothing
507+
return sbx.Stop(ctx, tracer)
493508
})
494509

495510
err = sbx.WaitForEnvd(
@@ -503,40 +518,38 @@ func ResumeSandbox(
503518

504519
go sbx.Checks.Start()
505520

506-
return sbx, nil
507-
}
521+
go func() {
522+
// Wait for either uffd or fc process to exit
523+
select {
524+
case <-fcUffd.Exit().Done:
525+
case <-fcHandle.Exit.Done:
526+
}
508527

509-
func (s *Sandbox) Wait(ctx context.Context) error {
510-
select {
511-
case <-ctx.Done():
512-
return ctx.Err()
513-
case <-s.process.Exit.Done:
514-
_, fcErr := s.process.Exit.Result()
515-
stopErr := s.Stop(ctx)
516-
uffdErr := <-s.uffdExit
528+
err := s.Stop(context.WithoutCancel(ctx), tracer)
517529

518-
return errors.Join(fcErr, stopErr, uffdErr)
519-
case uffdErr := <-s.uffdExit:
520-
stopErr := s.Stop(ctx)
530+
_, uffdWaitErr := fcUffd.Exit().Wait()
531+
_, fcErr := fcHandle.Exit.Wait()
532+
exit.SetResult(struct{}{}, errors.Join(err, fcErr, uffdWaitErr))
533+
}()
521534

522-
_, fcErr := s.process.Exit.WaitWithContext(ctx)
535+
return sbx, nil
536+
}
523537

524-
return errors.Join(uffdErr, stopErr, fcErr)
525-
}
538+
func (s *Sandbox) Wait(ctx context.Context) error {
539+
_, err := s.exit.WaitWithContext(ctx)
540+
return err
526541
}
527542

528-
// Stop starts the cleanup process for the sandbox.
529-
func (s *Sandbox) Stop(ctx context.Context) error {
543+
func (s *Sandbox) Close(ctx context.Context) error {
530544
err := s.cleanup.Run(ctx)
531545
if err != nil {
532-
sbxlogger.I(s).Error("failed to stop sandbox", zap.Error(err))
533-
return fmt.Errorf("failed to stop sandbox: %w", err)
546+
return fmt.Errorf("failed to cleanup sandbox: %w", err)
534547
}
535-
536548
return nil
537549
}
538550

539-
func (s *Sandbox) Close(ctx context.Context, tracer trace.Tracer) error {
551+
// Stop kills the sandbox.
552+
func (s *Sandbox) Stop(ctx context.Context, tracer trace.Tracer) error {
540553
_, span := tracer.Start(ctx, "sandbox-close")
541554
defer span.End()
542555

@@ -659,7 +672,7 @@ func (s *Sandbox) Pause(
659672
originalRootfs.Header(),
660673
&RootfsDiffCreator{
661674
rootfs: s.rootfs,
662-
stopHook: s.Stop,
675+
stopHook: s.Close,
663676
},
664677
)
665678
if err != nil {
@@ -894,8 +907,8 @@ func (s *Sandbox) WaitForExit(
894907
return fmt.Errorf("waiting for exit took too long")
895908
case <-ctx.Done():
896909
return nil
897-
case <-s.process.Exit.Done:
898-
_, err := s.process.Exit.Result()
910+
case <-s.exit.Done:
911+
_, err := s.exit.Result()
899912
if err == nil {
900913
return nil
901914
}

packages/orchestrator/internal/sandbox/uffd/handler.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/uffd/fdexit"
1717
"github.com/e2b-dev/infra/packages/orchestrator/internal/sandbox/uffd/mapping"
1818
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
19+
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
1920
)
2021

2122
const (
@@ -25,7 +26,7 @@ const (
2526
)
2627

2728
type Uffd struct {
28-
exitCh chan error
29+
exit *utils.SetOnce[struct{}]
2930
readyCh chan struct{}
3031

3132
fdExit *fdexit.FdExit
@@ -48,7 +49,7 @@ func New(memfile block.ReadonlyDevice, socketPath string, blockSize int64) (*Uff
4849
}
4950

5051
return &Uffd{
51-
exitCh: make(chan error, 1),
52+
exit: utils.NewSetOnce[struct{}](),
5253
readyCh: make(chan struct{}, 1),
5354
fdExit: fdExit,
5455
memfile: trackedMemfile,
@@ -75,10 +76,9 @@ func (u *Uffd) Start(sandboxId string) error {
7576
closeErr := u.lis.Close()
7677
fdExitErr := u.fdExit.Close()
7778

78-
u.exitCh <- errors.Join(handleErr, closeErr, fdExitErr)
79+
u.exit.SetResult(struct{}{}, errors.Join(handleErr, closeErr, fdExitErr))
7980

8081
close(u.readyCh)
81-
close(u.exitCh)
8282
}()
8383

8484
return nil
@@ -165,8 +165,8 @@ func (u *Uffd) Ready() chan struct{} {
165165
return u.readyCh
166166
}
167167

168-
func (u *Uffd) Exit() chan error {
169-
return u.exitCh
168+
func (u *Uffd) Exit() *utils.SetOnce[struct{}] {
169+
return u.exit
170170
}
171171

172172
func (u *Uffd) TrackAndReturnNil() error {
Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
package uffd
22

3-
import "github.com/bits-and-blooms/bitset"
3+
import (
4+
"github.com/bits-and-blooms/bitset"
5+
6+
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
7+
)
48

59
type MemoryBackend interface {
610
Disable() error
@@ -9,5 +13,5 @@ type MemoryBackend interface {
913
Start(sandboxId string) error
1014
Stop() error
1115
Ready() chan struct{}
12-
Exit() chan error
16+
Exit() *utils.SetOnce[struct{}]
1317
}

packages/orchestrator/internal/sandbox/uffd/noop.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"github.com/bits-and-blooms/bitset"
55

66
"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
7+
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
78
)
89

910
type NoopMemory struct {
@@ -48,6 +49,6 @@ func (m *NoopMemory) Ready() chan struct{} {
4849
return ch
4950
}
5051

51-
func (m *NoopMemory) Exit() chan error {
52-
return make(chan error)
52+
func (m *NoopMemory) Exit() *utils.SetOnce[struct{}] {
53+
return utils.NewSetOnce[struct{}]()
5354
}

packages/orchestrator/internal/server/sandboxes.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ func (s *server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
138138
sbxlogger.I(sbx).Error("failed to wait for sandbox, cleaning up", zap.Error(waitErr))
139139
}
140140

141-
cleanupErr := sbx.Stop(ctx)
141+
cleanupErr := sbx.Close(ctx)
142142
if cleanupErr != nil {
143143
sbxlogger.I(sbx).Error("failed to cleanup sandbox, will remove from cache", zap.Error(cleanupErr))
144144
}
@@ -304,7 +304,7 @@ func (s *server) Delete(ctxConn context.Context, in *orchestrator.SandboxDeleteR
304304
// Start the cleanup in a goroutine—the initial kill request should be send as the first thing in stop, and at this point you cannot route to the sandbox anymore.
305305
// We don't wait for the whole cleanup to finish here.
306306
go func() {
307-
err := sbx.Stop(ctx)
307+
err := sbx.Close(ctx)
308308
if err != nil {
309309
sbxlogger.I(sbx).Error("error stopping sandbox", logger.WithSandboxID(in.SandboxId), zap.Error(err))
310310
}
@@ -372,7 +372,7 @@ func (s *server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest
372372
ctx, childSpan := s.tracer.Start(ctx, "sandbox-pause-stop")
373373
defer childSpan.End()
374374

375-
err := sbx.Stop(ctx)
375+
err := sbx.Close(ctx)
376376
if err != nil {
377377
sbxlogger.I(sbx).Error("error stopping sandbox after snapshot", logger.WithSandboxID(in.SandboxId), zap.Error(err))
378378
}

packages/orchestrator/internal/template/build/core/filesystem/ext4.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,15 @@ func ReadFile(ctx context.Context, tracer trace.Tracer, rootfsPath string, fileP
229229
_, statSpan := tracer.Start(ctx, "ext4-read-file")
230230
defer statSpan.End()
231231

232+
_, err := os.Stat(rootfsPath)
233+
if err != nil {
234+
return "1", fmt.Errorf("rootfs file does not exist: %w", err)
235+
}
236+
232237
cmd := exec.Command("debugfs", "-R", fmt.Sprintf("cat \"%s\"", filePath), rootfsPath)
233-
out, err := cmd.Output()
238+
out, err := cmd.CombinedOutput()
234239
if err != nil {
235-
return "1", fmt.Errorf("error reading file: %w", err)
240+
return "1", fmt.Errorf("error reading file %s: %w", filePath, err)
236241
}
237242

238243
return string(out), nil

packages/orchestrator/internal/template/build/layer/layer_executor.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,7 @@ func (lb *LayerExecutor) BuildLayer(
9292
if err != nil {
9393
return metadata.Template{}, err
9494
}
95-
defer sbx.Stop(ctx)
96-
go func() {
97-
err := sbx.Wait(context.WithoutCancel(ctx))
98-
if err != nil {
99-
lb.logger.Error("error waiting for sandbox", zap.Error(err))
100-
}
101-
}()
95+
defer sbx.Close(ctx)
10296

10397
// Add to proxy so we can call envd commands
10498
lb.sandboxes.Insert(sbx.Runtime.SandboxID, sbx)

packages/orchestrator/internal/template/build/phases/base/builder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ func (bb *BaseBuilder) buildLayerFromOCI(
284284
if err != nil {
285285
return metadata.Template{}, fmt.Errorf("error creating sandbox: %w", err)
286286
}
287-
defer sourceSbx.Stop(ctx)
287+
defer sourceSbx.Close(ctx)
288288

289289
err = sourceSbx.WaitForEnvd(
290290
ctx,

packages/orchestrator/internal/template/build/phases/base/provision.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func (bb *BaseBuilder) provisionSandbox(
9999
if err != nil {
100100
return fmt.Errorf("error creating sandbox: %w", err)
101101
}
102-
defer sbx.Stop(ctx)
102+
defer sbx.Close(ctx)
103103

104104
err = sbx.WaitForExit(
105105
ctx,

packages/shared/pkg/utils/set_once.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ func (s *SetOnce[T]) SetError(err error) error {
4444
return s.setResult(result[T]{err: err})
4545
}
4646

47+
func (s *SetOnce[T]) SetResult(value T, err error) error {
48+
if err != nil {
49+
return s.SetError(err)
50+
}
51+
return s.SetValue(value)
52+
}
53+
4754
var ErrAlreadySet = fmt.Errorf("value already set")
4855

4956
// SetResult internal method for setting the result only once.

0 commit comments

Comments
 (0)