Commit 74f2aeb

Basic stuck job detection
Here, try to make some inroads on a feature we've been talking about for a while: detection of stuck jobs. Unfortunately, in Go it's quite easy to accidentally park a job forever by `select`ing on a channel that never receives and forgetting a separate `<-ctx.Done()` branch, which also means the job won't respect its timeout. Here, add some basic detection for that case. Eventually we'd like to give users options for what to do when jobs become stuck, but for now we do only the simplest things: log when we detect a stuck job, and count the number of stuck jobs in a producer's stats loop. In the future we may want to add additional intelligence, like having producers move stuck jobs to a separate bucket up to a certain limit before crashing (the next best option, because it's not possible to manually kill goroutines).
1 parent dce66cd commit 74f2aeb
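
As an illustration of the failure mode described above, here's a minimal sketch (the `workStuck`/`workCancelable` functions and their channels are hypothetical, not part of River): the first worker's `select` has no `<-ctx.Done()` branch and can park forever, while the second adds the branch so the executor's cancellation, and with it the job timeout, still takes effect.

```go
package example

import "context"

// workStuck shows the bug: a select with no <-ctx.Done() branch. If neither
// channel ever receives, the goroutine is parked forever and the job ignores
// its timeout because it never observes the executor's cancellation.
func workStuck(ctx context.Context, resultCh <-chan error, doneCh <-chan struct{}) error {
	select {
	case err := <-resultCh:
		return err
	case <-doneCh:
		return nil
	}
}

// workCancelable adds the missing branch so context cancellation (and
// therefore the job timeout) can still unblock the worker.
func workCancelable(ctx context.Context, resultCh <-chan error, doneCh <-chan struct{}) error {
	select {
	case err := <-resultCh:
		return err
	case <-doneCh:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
```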

File tree

4 files changed: +120 −13 lines

4 files changed

+120
-13
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Basic stuck detection after a job's exceeded its timeout and still not returned after the executor's initiated context cancellation and waited a short margin for the cancellation to take effect. [PR #1097](https://github.com/riverqueue/river/pull/1097).
+
 ## [0.28.0] - 2025-11-23
 
 ### Added

internal/jobexecutor/job_executor.go

Lines changed: 55 additions & 5 deletions
@@ -112,12 +112,17 @@ type JobExecutor struct {
 	ErrorHandler           ErrorHandler
 	HookLookupByJob        *hooklookup.JobHookLookup
 	HookLookupGlobal       hooklookup.HookLookupInterface
-	InformProducerDoneFunc func(jobRow *rivertype.JobRow)
 	JobRow                 *rivertype.JobRow
 	MiddlewareLookupGlobal middlewarelookup.MiddlewareLookupInterface
-	SchedulerInterval      time.Duration
-	WorkerMiddleware       []rivertype.WorkerMiddleware
-	WorkUnit               workunit.WorkUnit
+	ProducerCallbacks      struct {
+		Done    func(jobRow *rivertype.JobRow)
+		Stuck   func()
+		Unstuck func()
+	}
+	SchedulerInterval      time.Duration
+	StuckThresholdOverride time.Duration
+	WorkerMiddleware       []rivertype.WorkerMiddleware
+	WorkUnit               workunit.WorkUnit
 
 	// Meant to be used from within the job executor only.
 	start time.Time
@@ -159,7 +164,7 @@ func (e *JobExecutor) Execute(ctx context.Context) {
 		}
 	}
 
-	e.InformProducerDoneFunc(e.JobRow)
+	e.ProducerCallbacks.Done(e.JobRow)
 }
 
 // Executes the job, handling a panic if necessary (and various other error
@@ -171,6 +176,51 @@ func (e *JobExecutor) execute(ctx context.Context) (res *jobExecutorResult) {
 	metadataUpdates := make(map[string]any)
 	ctx = context.WithValue(ctx, ContextKeyMetadataUpdates, metadataUpdates)
 
+	// Watches for jobs that may have become stuck. i.e. They've run longer than
+	// their job timeout (plus a small margin) and don't appear to be responding
+	// to context cancellation (unfortunately, quite an easy error to make in
+	// Go).
+	//
+	// Currently we don't do anything if we notice a job is stuck. Knowing about
+	// stuck jobs is just used for informational purposes in the producer in
+	// generating periodic stats.
+	if e.ClientJobTimeout > 0 {
+		ctx, cancel := context.WithCancel(ctx)
+		defer cancel()
+
+		go func() {
+			const stuckThresholdDefault = 5 * time.Second
+
+			select {
+			case <-ctx.Done():
+				// cancellation or execution finished
+
+			case <-time.After(e.ClientJobTimeout + cmp.Or(e.StuckThresholdOverride, stuckThresholdDefault)):
+				e.ProducerCallbacks.Stuck()
+
+				e.Logger.WarnContext(ctx, e.Name+": Job appears to be stuck",
+					slog.Int64("job_id", e.JobRow.ID),
+					slog.String("kind", e.JobRow.Kind),
+					slog.Duration("timeout", e.ClientJobTimeout),
+				)
+
+				// In case the executor ever becomes unstuck, inform the
+				// producer. However, if we got all the way here there's a good
+				// chance this will never happen (the worker is really stuck and
+				// will never return).
+				defer e.ProducerCallbacks.Unstuck()
+
+				defer func() {
+					e.Logger.InfoContext(ctx, e.Name+": Job became unstuck",
+						slog.Duration("duration", time.Since(e.start)),
+						slog.Int64("job_id", e.JobRow.ID),
+						slog.String("kind", e.JobRow.Kind),
+					)
+				}()
+			}
+		}()
+	}
+
 	defer func() {
 		if recovery := recover(); recovery != nil {
 			e.Logger.ErrorContext(ctx, e.Name+": panic recovery; possible bug with Worker",
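
To see the mechanism from the hunk above in isolation, here's a standalone sketch of the same watchdog idea. The names (`watchStuck`, `onStuck`, `onUnstuck`) are illustrative rather than River's API, and it reports the unstuck transition by waiting on `ctx.Done()` instead of the executor's `defer`-based approach.

```go
package main

import (
	"cmp"
	"context"
	"fmt"
	"time"
)

// watchStuck fires onStuck if ctx hasn't been cancelled within timeout plus a
// margin, and onUnstuck if the work eventually finishes anyway. The caller
// cancels ctx when the work returns.
func watchStuck(ctx context.Context, timeout, marginOverride time.Duration, onStuck, onUnstuck func()) {
	const marginDefault = 5 * time.Second

	select {
	case <-ctx.Done():
		// Work finished (or was cancelled) before the threshold; nothing to do.
	case <-time.After(timeout + cmp.Or(marginOverride, marginDefault)):
		onStuck()
		// If the work ever does return, ctx gets cancelled and we can report
		// that the job became unstuck after all.
		<-ctx.Done()
		onUnstuck()
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())

	go watchStuck(ctx, 10*time.Millisecond, time.Nanosecond,
		func() { fmt.Println("job appears to be stuck") },
		func() { fmt.Println("job became unstuck") },
	)

	time.Sleep(50 * time.Millisecond) // simulate work that outlives its timeout
	cancel()                          // work finally returned
	time.Sleep(10 * time.Millisecond) // give the watchdog time to report
}
```

The `cmp.Or` call mirrors how `StuckThresholdOverride` is handled: any non-zero override wins, otherwise the five-second default applies.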

internal/jobexecutor/job_executor_test.go

Lines changed: 41 additions & 3 deletions
@@ -191,11 +191,19 @@ func TestJobExecutor_Execute(t *testing.T) {
 			ErrorHandler:           bundle.errorHandler,
 			HookLookupByJob:        hooklookup.NewJobHookLookup(),
 			HookLookupGlobal:       hooklookup.NewHookLookup(nil),
-			InformProducerDoneFunc: func(job *rivertype.JobRow) {},
 			JobRow:                 bundle.jobRow,
 			MiddlewareLookupGlobal: middlewarelookup.NewMiddlewareLookup(nil),
-			SchedulerInterval:      riverinternaltest.SchedulerShortInterval,
-			WorkUnit:               workUnitFactory.MakeUnit(bundle.jobRow),
+			ProducerCallbacks: struct {
+				Done    func(jobRow *rivertype.JobRow)
+				Stuck   func()
+				Unstuck func()
+			}{
+				Done:    func(jobRow *rivertype.JobRow) {},
+				Stuck:   func() {},
+				Unstuck: func() {},
+			},
+			SchedulerInterval: riverinternaltest.SchedulerShortInterval,
+			WorkUnit:          workUnitFactory.MakeUnit(bundle.jobRow),
 		})
 
 		return executor, bundle
@@ -696,6 +704,36 @@ func TestJobExecutor_Execute(t *testing.T) {
 		})
 	})
 
+	t.Run("StuckDetection", func(t *testing.T) {
+		t.Parallel()
+
+		executor, bundle := setup(t)
+
+		executor.ClientJobTimeout = 5 * time.Millisecond
+		executor.StuckThresholdOverride = 1 * time.Nanosecond // must be greater than 0 to take effect
+
+		var (
+			informProducerStuckReceived   = make(chan struct{})
+			informProducerUnstuckReceived = make(chan struct{})
+		)
+		executor.ProducerCallbacks.Stuck = func() {
+			close(informProducerStuckReceived)
+		}
+		executor.ProducerCallbacks.Unstuck = func() {
+			close(informProducerUnstuckReceived)
+		}
+
+		executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error {
+			riversharedtest.WaitOrTimeout(t, informProducerStuckReceived)
+			return nil
+		}, nil).MakeUnit(bundle.jobRow)
+
+		executor.Execute(ctx)
+		_ = riversharedtest.WaitOrTimeout(t, bundle.updateCh)
+
+		riversharedtest.WaitOrTimeout(t, informProducerUnstuckReceived)
+	})
+
 	t.Run("Panic", func(t *testing.T) {
 		t.Parallel()

producer.go

Lines changed: 20 additions & 5 deletions
@@ -209,6 +209,7 @@ type producer struct {
 	// An atomic count of the number of jobs actively being worked on. This is
 	// written to by the main goroutine, but read by the dispatcher.
 	numJobsActive atomic.Int32
+	numJobsStuck  atomic.Int32
 
 	numJobsRan atomic.Uint64
 	paused     bool
@@ -771,20 +772,26 @@ func (p *producer) heartbeatLogLoop(ctx context.Context, wg *sync.WaitGroup) {
 	ticker := time.NewTicker(5 * time.Second)
 	defer ticker.Stop()
 	type jobCount struct {
-		ran    uint64
 		active int
+		ran    uint64
+		stuck  int
 	}
 	var prevCount jobCount
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
-			curCount := jobCount{ran: p.numJobsRan.Load(), active: int(p.numJobsActive.Load())}
+			curCount := jobCount{
+				active: int(p.numJobsActive.Load()),
+				ran:    p.numJobsRan.Load(),
+				stuck:  int(p.numJobsStuck.Load()),
+			}
 			if curCount != prevCount {
 				p.Logger.InfoContext(ctx, p.Name+": Producer job counts",
 					slog.Uint64("num_completed_jobs", curCount.ran),
 					slog.Int("num_jobs_running", curCount.active),
+					slog.Int("num_jobs_stuck", curCount.stuck),
 					slog.String("queue", p.config.Queue),
 				)
 			}
@@ -815,10 +822,18 @@ func (p *producer) startNewExecutors(workCtx context.Context, jobs []*rivertype.
 			HookLookupByJob:        p.config.HookLookupByJob,
 			HookLookupGlobal:       p.config.HookLookupGlobal,
 			MiddlewareLookupGlobal: p.config.MiddlewareLookupGlobal,
-			InformProducerDoneFunc: p.handleWorkerDone,
 			JobRow:                 job,
-			SchedulerInterval:      p.config.SchedulerInterval,
-			WorkUnit:               workUnit,
+			ProducerCallbacks: struct {
+				Done    func(jobRow *rivertype.JobRow)
+				Stuck   func()
+				Unstuck func()
+			}{
+				Done:    p.handleWorkerDone,
+				Stuck:   func() { p.numJobsStuck.Add(1) },
+				Unstuck: func() { p.numJobsStuck.Add(-1) },
+			},
+			SchedulerInterval: p.config.SchedulerInterval,
+			WorkUnit:          workUnit,
 		})
 		p.addActiveJob(job.ID, executor)
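
On the producer side, here's a small sketch of the bookkeeping this diff wires up (the names are simplified and not River's API): the `Stuck`/`Unstuck` callbacks adjust an atomic gauge, and the stats loop snapshots the counters into a comparable struct so it only logs when something has changed since the previous tick.

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

type producerStats struct {
	numJobsActive atomic.Int32
	numJobsRan    atomic.Uint64
	numJobsStuck  atomic.Int32
}

// jobCount holds a snapshot of the counters. All fields are comparable, so
// consecutive snapshots can be compared with != to skip redundant log lines.
type jobCount struct {
	active int
	ran    uint64
	stuck  int
}

func main() {
	var (
		stats     producerStats
		prevCount jobCount
	)

	// Callbacks as they'd be handed to an executor.
	stuck := func() { stats.numJobsStuck.Add(1) }
	unstuck := func() { stats.numJobsStuck.Add(-1) }

	// Two executors report stuck jobs; one later recovers.
	stuck()
	stuck()
	unstuck()

	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()

	for range 3 {
		<-ticker.C
		curCount := jobCount{
			active: int(stats.numJobsActive.Load()),
			ran:    stats.numJobsRan.Load(),
			stuck:  int(stats.numJobsStuck.Load()),
		}
		if curCount != prevCount {
			// Logs only on the first tick here; later ticks see no change.
			fmt.Printf("num_jobs_running=%d num_completed_jobs=%d num_jobs_stuck=%d\n",
				curCount.active, curCount.ran, curCount.stuck)
		}
		prevCount = curCount
	}
}
```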
