4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Basic stuck detection for jobs that have exceeded their timeout and still haven't returned after the executor has initiated context cancellation and waited a short margin for the cancellation to take effect. [PR #1097](https://github.com/riverqueue/river/pull/1097).

## [0.29.0-rc.1] - 2025-12-04

- Added `HookPeriodicJobsStart` that can be used to run custom logic when a periodic job enqueuer starts up on a new leader. [PR #1084](https://github.com/riverqueue/river/pull/1084).
66 changes: 61 additions & 5 deletions internal/jobexecutor/job_executor.go
@@ -112,12 +112,17 @@ type JobExecutor struct {
ErrorHandler ErrorHandler
HookLookupByJob *hooklookup.JobHookLookup
HookLookupGlobal hooklookup.HookLookupInterface
InformProducerDoneFunc func(jobRow *rivertype.JobRow)
JobRow *rivertype.JobRow
MiddlewareLookupGlobal middlewarelookup.MiddlewareLookupInterface
SchedulerInterval time.Duration
WorkerMiddleware []rivertype.WorkerMiddleware
WorkUnit workunit.WorkUnit
ProducerCallbacks struct {
JobDone func(jobRow *rivertype.JobRow)
Stuck func()
Unstuck func()
}
SchedulerInterval time.Duration
StuckThresholdOverride time.Duration
WorkerMiddleware []rivertype.WorkerMiddleware
WorkUnit workunit.WorkUnit

// Meant to be used from within the job executor only.
start time.Time
@@ -159,7 +164,7 @@ func (e *JobExecutor) Execute(ctx context.Context) {
}
}

e.InformProducerDoneFunc(e.JobRow)
e.ProducerCallbacks.JobDone(e.JobRow)
}

// Executes the job, handling a panic if necessary (and various other error
@@ -171,6 +176,57 @@ func (e *JobExecutor) execute(ctx context.Context) (res *jobExecutorResult) {
metadataUpdates := make(map[string]any)
ctx = context.WithValue(ctx, ContextKeyMetadataUpdates, metadataUpdates)

// Watches for jobs that may have become stuck, i.e. they've run longer than
// their job timeout (plus a small margin) and don't appear to be responding
// to context cancellation (unfortunately, quite an easy error to make in
// Go).
//
// Currently we don't do anything if we notice a job is stuck. Knowing about
// stuck jobs is just used for informational purposes in the producer in
// generating periodic stats.
if e.ClientJobTimeout > 0 {
// We add a WithoutCancel here so that this inner goroutine becomes
// immune to all context cancellations _except_ the one where it's
// cancelled because we leave JobExecutor.execute.
ctx, cancel := context.WithCancel(context.WithoutCancel(ctx))
defer cancel()

go func() {
const stuckThresholdDefault = 5 * time.Second

select {
case <-ctx.Done():
// context cancelled as we leave JobExecutor.execute

case <-time.After(e.ClientJobTimeout + cmp.Or(e.StuckThresholdOverride, stuckThresholdDefault)):
e.ProducerCallbacks.Stuck()

e.Logger.WarnContext(ctx, e.Name+": Job appears to be stuck",
slog.Int64("job_id", e.JobRow.ID),
slog.String("kind", e.JobRow.Kind),
slog.Duration("timeout", e.ClientJobTimeout),
)

// context cancelled as we leave JobExecutor.execute
<-ctx.Done()

// In case the executor ever becomes unstuck, inform the
// producer. However, if we got all the way here there's a good
// chance this will never happen (the worker is really stuck and
// will never return).
defer e.ProducerCallbacks.Unstuck()
Comment on lines +213 to +217

Contributor

Doesn't this just run immediately after the above warning log? It's deferring within the inner go func() closure which merely exits after both these defers are added to the stack, and there's nothing to block on the jobs actually becoming unstuck. Or am I missing something?

Contributor Author

Yeah, there was another <- ctx.Done() that was missing here. I've added that in, and also improved the test case so that it'll fail in the event that wait is missing.

I also put in some more logging in the test case so you can run it and verify manually (in case you want/need to) that it's working as expected. e.g.:

$ go test ./internal/jobexecutor -run TestJobExecutor_Execute/StuckDetectionActivates -test.v
=== RUN   TestJobExecutor_Execute
=== PAUSE TestJobExecutor_Execute
=== CONT  TestJobExecutor_Execute
=== RUN   TestJobExecutor_Execute/StuckDetectionActivates
=== PAUSE TestJobExecutor_Execute/StuckDetectionActivates
=== CONT  TestJobExecutor_Execute/StuckDetectionActivates
    riverdbtest.go:216: Dropped 1 expired postgres schema(s) in 14.537458ms
    riverdbtest.go:293: TestSchemaOpts.disableReuse is set; schema not checked in for reuse
    job_executor_test.go:715: Generated postgres schema "jobexecutor_2025_12_08t09_03_56_schema_01" with migrations [1 2 3 4 5 6] on line "main" in 63.787208ms [1 generated] [0 reused]
    job_executor_test.go:715: TestTx using postgres schema: jobexecutor_2025_12_08t09_03_56_schema_01
    job_executor_test.go:724: Job executor reported stuck
    logger.go:256: time=2025-12-08T09:03:56.218-05:00 level=WARN msg="jobexecutor.JobExecutor: Job appears to be stuck" job_id=1 kind=jobexecutor_test timeout=5ms
    job_executor_test.go:739: Job executor still stuck after wait (this is expected)
    logger.go:256: time=2025-12-08T09:03:56.229-05:00 level=INFO msg="jobexecutor.JobExecutor: Job became unstuck" duration=17.011ms job_id=1 kind=jobexecutor_test
    job_executor_test.go:728: Job executor reported unstuck (after being stuck)
--- PASS: TestJobExecutor_Execute (0.00s)
    --- PASS: TestJobExecutor_Execute/StuckDetectionActivates (0.12s)
PASS
ok      github.com/riverqueue/river/internal/jobexecutor        0.299s

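To make the point in this thread concrete, here's a minimal standalone sketch (not part of the PR) of the defer semantics in question: a defer registered inside a goroutine's closure runs as soon as that closure returns, so without the added blocking receive on `ctx.Done()` the unstuck callback would fire immediately after the stuck one.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		// Runs when this closure returns, not when the outer work finishes.
		defer fmt.Println("unstuck reported")

		fmt.Println("stuck reported")

		// Without this receive the closure returns immediately and the defer
		// above fires right away; blocking here ties the unstuck report to
		// the executor actually exiting (which cancels ctx).
		<-ctx.Done()
	}()

	time.Sleep(50 * time.Millisecond) // simulate the job eventually returning
	cancel()                          // executor leaves execute, cancelling ctx
	time.Sleep(10 * time.Millisecond) // let the goroutine observe the cancellation
}
```
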

defer func() {
e.Logger.InfoContext(ctx, e.Name+": Job became unstuck",
slog.Duration("duration", time.Since(e.start)),
slog.Int64("job_id", e.JobRow.ID),
slog.String("kind", e.JobRow.Kind),
)
}()
}
}()
}

defer func() {
if recovery := recover(); recovery != nil {
e.Logger.ErrorContext(ctx, e.Name+": panic recovery; possible bug with Worker",
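As context for the "easy error to make in Go" mentioned in the comment above, here's a hedged sketch of a worker that would trip this stuck detection. `StuckArgs` and `StuckWorker` are invented names for illustration; `river.Worker`, `river.Job`, `river.WorkerDefaults`, and the per-worker `Timeout` override are River's public worker API.

```go
package example

import (
	"context"
	"time"

	"github.com/riverqueue/river"
)

type StuckArgs struct{}

func (StuckArgs) Kind() string { return "stuck_example" }

type StuckWorker struct {
	river.WorkerDefaults[StuckArgs]
}

// Timeout tells the executor to cancel the work context after one second.
func (w *StuckWorker) Timeout(job *river.Job[StuckArgs]) time.Duration { return time.Second }

func (w *StuckWorker) Work(ctx context.Context, job *river.Job[StuckArgs]) error {
	// Bug: time.Sleep never checks ctx, so the executor's cancellation on
	// timeout has no effect and the job is flagged as stuck once the timeout
	// plus the stuck threshold have elapsed.
	time.Sleep(10 * time.Minute)
	return nil

	// A cancellation-aware version would select on ctx.Done() instead:
	//
	//	select {
	//	case <-time.After(10 * time.Minute):
	//		return nil
	//	case <-ctx.Done():
	//		return ctx.Err()
	//	}
}
```
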
102 changes: 99 additions & 3 deletions internal/jobexecutor/job_executor_test.go
@@ -191,11 +191,19 @@ func TestJobExecutor_Execute(t *testing.T) {
ErrorHandler: bundle.errorHandler,
HookLookupByJob: hooklookup.NewJobHookLookup(),
HookLookupGlobal: hooklookup.NewHookLookup(nil),
InformProducerDoneFunc: func(job *rivertype.JobRow) {},
JobRow: bundle.jobRow,
MiddlewareLookupGlobal: middlewarelookup.NewMiddlewareLookup(nil),
SchedulerInterval: riverinternaltest.SchedulerShortInterval,
WorkUnit: workUnitFactory.MakeUnit(bundle.jobRow),
ProducerCallbacks: struct {
JobDone func(jobRow *rivertype.JobRow)
Stuck func()
Unstuck func()
}{
JobDone: func(jobRow *rivertype.JobRow) {},
Stuck: func() {},
Unstuck: func() {},
},
SchedulerInterval: riverinternaltest.SchedulerShortInterval,
WorkUnit: workUnitFactory.MakeUnit(bundle.jobRow),
})

return executor, bundle
@@ -696,6 +704,94 @@ func TestJobExecutor_Execute(t *testing.T) {
})
})

configureStuckDetection := func(executor *JobExecutor) {
executor.ClientJobTimeout = 5 * time.Millisecond
executor.StuckThresholdOverride = 1 * time.Nanosecond // must be greater than 0 to take effect
}

t.Run("StuckDetectionActivates", func(t *testing.T) {
t.Parallel()

executor, bundle := setup(t)

configureStuckDetection(executor)

var (
informProducerStuckReceived = make(chan struct{})
informProducerUnstuckReceived = make(chan struct{})
)
executor.ProducerCallbacks.Stuck = func() {
t.Log("Job executor reported stuck")
close(informProducerStuckReceived)
}
executor.ProducerCallbacks.Unstuck = func() {
t.Log("Job executor reported unstuck (after being stuck)")
close(informProducerUnstuckReceived)
}

executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error {
riversharedtest.WaitOrTimeout(t, informProducerStuckReceived)

select {
case <-informProducerUnstuckReceived:
require.FailNow(t, "Executor should not have reported unstuck immediately")
case <-time.After(10 * time.Millisecond):
t.Log("Job executor still stuck after wait (this is expected)")
}

return nil
}, nil).MakeUnit(bundle.jobRow)

executor.Execute(ctx)
_ = riversharedtest.WaitOrTimeout(t, bundle.updateCh)

riversharedtest.WaitOrTimeout(t, informProducerUnstuckReceived)
})

// Checks that even if a job's work context is cancelled immediately, stuck
// detection still works as expected.
t.Run("StuckDetectionIgnoresParentContextCancellation", func(t *testing.T) {
t.Parallel()

executor, bundle := setup(t)

configureStuckDetection(executor)

var (
informProducerStuckReceived = make(chan struct{})
informProducerUnstuckReceived = make(chan struct{})
)
executor.ProducerCallbacks.Stuck = func() {
t.Log("Job executor reported stuck")
close(informProducerStuckReceived)
}
executor.ProducerCallbacks.Unstuck = func() {
t.Log("Job executor reported unstuck (after being stuck)")
close(informProducerUnstuckReceived)
}

executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error {
riversharedtest.WaitOrTimeout(t, informProducerStuckReceived)

select {
case <-informProducerUnstuckReceived:
require.FailNow(t, "Executor should not have reported unstuck immediately")
case <-time.After(10 * time.Millisecond):
t.Log("Job executor still stuck after wait (this is expected)")
}

return nil
}, nil).MakeUnit(bundle.jobRow)

ctx, cancel := context.WithCancel(ctx)
cancel() // cancel immediately

executor.Execute(ctx)
_ = riversharedtest.WaitOrTimeout(t, bundle.updateCh)

riversharedtest.WaitOrTimeout(t, informProducerUnstuckReceived)
})

t.Run("Panic", func(t *testing.T) {
t.Parallel()

25 changes: 20 additions & 5 deletions producer.go
@@ -209,6 +209,7 @@ type producer struct {
// An atomic count of the number of jobs actively being worked on. This is
// written to by the main goroutine, but read by the dispatcher.
numJobsActive atomic.Int32
numJobsStuck atomic.Int32

numJobsRan atomic.Uint64
paused bool
@@ -771,20 +772,26 @@ func (p *producer) heartbeatLogLoop(ctx context.Context, wg *sync.WaitGroup) {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
type jobCount struct {
ran uint64
active int
ran uint64
stuck int
}
var prevCount jobCount
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
curCount := jobCount{ran: p.numJobsRan.Load(), active: int(p.numJobsActive.Load())}
curCount := jobCount{
active: int(p.numJobsActive.Load()),
ran: p.numJobsRan.Load(),
stuck: int(p.numJobsStuck.Load()),
}
if curCount != prevCount {
p.Logger.InfoContext(ctx, p.Name+": Producer job counts",
slog.Uint64("num_completed_jobs", curCount.ran),
slog.Int("num_jobs_running", curCount.active),
slog.Int("num_jobs_stuck", curCount.stuck),
slog.String("queue", p.config.Queue),
)
}
@@ -815,10 +822,18 @@ func (p *producer) startNewExecutors(workCtx context.Context, jobs []*rivertype.
HookLookupByJob: p.config.HookLookupByJob,
HookLookupGlobal: p.config.HookLookupGlobal,
MiddlewareLookupGlobal: p.config.MiddlewareLookupGlobal,
InformProducerDoneFunc: p.handleWorkerDone,
JobRow: job,
SchedulerInterval: p.config.SchedulerInterval,
WorkUnit: workUnit,
ProducerCallbacks: struct {
JobDone func(jobRow *rivertype.JobRow)
Stuck func()
Unstuck func()
}{
JobDone: p.handleWorkerDone,
Stuck: func() { p.numJobsStuck.Add(1) },
Unstuck: func() { p.numJobsStuck.Add(-1) },
},
SchedulerInterval: p.config.SchedulerInterval,
WorkUnit: workUnit,
})
p.addActiveJob(job.ID, executor)

14 changes: 11 additions & 3 deletions rivertest/worker.go
@@ -203,13 +203,21 @@ func (w *Worker[T, TTx]) workJob(ctx context.Context, tb testing.TB, tx TTx, job
return nil
},
},
InformProducerDoneFunc: func(job *rivertype.JobRow) { close(executionDone) },
HookLookupGlobal: hooklookup.NewHookLookup(w.config.Hooks),
HookLookupByJob: hooklookup.NewJobHookLookup(),
JobRow: job,
MiddlewareLookupGlobal: middlewarelookup.NewMiddlewareLookup(w.config.Middleware),
SchedulerInterval: maintenance.JobSchedulerIntervalDefault,
WorkUnit: workUnit,
ProducerCallbacks: struct {
JobDone func(jobRow *rivertype.JobRow)
Stuck func()
Unstuck func()
}{
JobDone: func(job *rivertype.JobRow) { close(executionDone) },
Stuck: func() {},
Unstuck: func() {},
},
SchedulerInterval: maintenance.JobSchedulerIntervalDefault,
WorkUnit: workUnit,
})

executor.Execute(jobCtx)