
Commit 2d6aff7

corhere and xinfengliu committed
manager: fix task scheduler infinite loop
If the running tasks for a service are not well balanced across the placement-preference tree, the task scheduler could enter an infinite loop when scaling the service up.

The scheduleNTasksOnSubtree loop terminates when either all tasks have been scheduled onto nodes or the nodes in all subtrees are out of room to accept new tasks. The trouble is that the algorithm only considers a subtree to be out of room if an attempt was made to schedule tasks onto its nodes and not all of them could be scheduled. Subtrees that already have more tasks running than the desired number for a balanced tree are skipped without any attempt to assign tasks, so they never get a chance to be marked as out of room. The scheduler therefore enters a tight infinite loop whenever some node of the placement-preference tree has at least one subtree with more tasks running than desired while all of its other subtrees are out of room for more tasks.

It would be incorrect to consider a subtree out of room merely because it has more tasks running than desired at a particular iteration of the scheduling loop: the desired number of tasks to assign changes as the scheduler iteratively schedules tasks and other subtrees run out of room, so a subtree can become eligible again in a later iteration. Instead, add a third condition to the task scheduler loop: exit when no subtree is eligible for task scheduling, whether because it is out of room or because it has more tasks running than desired.

Co-authored-by: Xinfeng Liu <[email protected]>
Signed-off-by: Cory Snider <[email protected]>
1 parent 8c19597 commit 2d6aff7
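To make the stuck state concrete, here is a minimal, self-contained sketch of the loop's termination logic. It is a model, not swarmkit's scheduler: the branch names, task counts, the pre-seeded noRoom set, and the already-diverged tasksScheduled value are all invented so the loop starts directly in the situation described above, where the only subtree that still has room already holds more tasks than the per-branch target and every pass skips every subtree. Without the converging guard the loop would spin forever; with it, the loop gives up after one pass.

package main

import "fmt"

type branch struct {
	name  string
	tasks int // tasks already running in this subtree
}

func main() {
	branches := []branch{
		{name: "dc1", tasks: 5}, // holds more tasks than desired: skipped every pass
		{name: "dc2", tasks: 1}, // known to be out of room: skipped every pass
	}
	noRoom := map[string]bool{"dc2": true} // subtrees that failed a scheduling attempt

	n := 1                     // tasks this call was asked to place
	tasksScheduled := 2        // contrived: already differs from n and can no longer change
	tasksInUsableBranches := 5 // tasks in subtrees not yet marked out of room

	passes := 0
	converging := true
	// Mirrors the fixed loop condition. Dropping "&& converging" reproduces
	// the old behavior: nothing in the body changes any state, so the first
	// two conditions stay true forever.
	for tasksScheduled != n && len(noRoom) != len(branches) && converging {
		passes++
		desired := (tasksInUsableBranches + n - tasksScheduled) / (len(branches) - len(noRoom))
		remainder := (tasksInUsableBranches + n - tasksScheduled) % (len(branches) - len(noRoom))
		converging = false
		for _, b := range branches {
			if noRoom[b.name] {
				continue // already known to be out of room
			}
			if b.tasks < desired || (b.tasks == desired && remainder > 0) {
				converging = true // an eligible subtree exists; keep iterating
				// ...scheduling onto the subtree would happen here...
			}
			// Over-full subtrees fall through: they are skipped without ever
			// being added to noRoom, which is what starved the old exit test.
		}
	}
	fmt.Printf("gave up after %d pass(es)\n", passes)
}

Run as-is this prints "gave up after 1 pass(es)": dc1 is skipped because 5 is not below the computed target of 4, dc2 is skipped because it is out of room, so converging stays false and the loop exits instead of spinning.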

File tree

3 files changed: +318 -1 lines changed

manager/scheduler/nodeset_test.go

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
package scheduler

import (
	"testing"

	"github.com/moby/swarmkit/v2/api"
)

func TestTreeTaskCountConsistency(t *testing.T) {
	// Create a nodeSet with some test nodes
	ns := &nodeSet{nodes: make(map[string]NodeInfo)}

	// Add test nodes with different labels and task counts
	nodes := []NodeInfo{
		{
			Node: &api.Node{
				ID: "node1",
				Spec: api.NodeSpec{
					Annotations: api.Annotations{
						Labels: map[string]string{"datacenter": "dc1", "rack": "r1"},
					},
				},
			},
			ActiveTasksCountByService: map[string]int{"service1": 3},
		},
		{
			Node: &api.Node{
				ID: "node2",
				Spec: api.NodeSpec{
					Annotations: api.Annotations{
						Labels: map[string]string{"datacenter": "dc1", "rack": "r2"},
					},
				},
			},
			ActiveTasksCountByService: map[string]int{"service1": 2},
		},
		{
			Node: &api.Node{
				ID: "node3",
				Spec: api.NodeSpec{
					Annotations: api.Annotations{
						Labels: map[string]string{"datacenter": "dc2", "rack": "r2"},
					},
				},
			},
			ActiveTasksCountByService: map[string]int{"service1": 4},
		},
		{
			Node: &api.Node{
				ID: "node4",
				Spec: api.NodeSpec{
					Annotations: api.Annotations{
						Labels: map[string]string{}, // no label
					},
				},
			},
			ActiveTasksCountByService: map[string]int{"service1": 2},
		},
		{
			Node: &api.Node{
				ID: "node5",
				Spec: api.NodeSpec{
					Annotations: api.Annotations{
						Labels: map[string]string{}, // no label
					},
				},
			},
			ActiveTasksCountByService: map[string]int{"service1": 1},
		},
	}

	for _, node := range nodes {
		ns.addOrUpdateNode(node)
	}

	preferences := []*api.PlacementPreference{
		{
			Preference: &api.PlacementPreference_Spread{
				Spread: &api.SpreadOver{
					SpreadDescriptor: "node.labels.datacenter",
				},
			},
		},
		{
			Preference: &api.PlacementPreference_Spread{
				Spread: &api.SpreadOver{
					SpreadDescriptor: "node.labels.rack",
				},
			},
		},
	}

	// Create the tree
	tree := ns.tree("service1", preferences, 10,
		func(*NodeInfo) bool { return true },
		func(a, b *NodeInfo) bool { return true })

	// Helper function to verify task count consistency recursively
	var verifyTaskCounts func(*testing.T, *decisionTree) int
	verifyTaskCounts = func(t *testing.T, dt *decisionTree) int {
		if dt == nil {
			return 0
		}

		if dt.next == nil {
			return dt.tasks
		}

		// Calculate sum of children's tasks
		childrenSum := 0
		for _, child := range dt.next {
			childrenSum += verifyTaskCounts(t, child)
		}

		// Verify parent's task count equals sum of children
		if dt.tasks != childrenSum {
			t.Errorf("Parent task count (%d) does not equal sum of children (%d)",
				dt.tasks, childrenSum)
		}

		return dt.tasks
	}

	// Run the verification
	verifyTaskCounts(t, &tree)

	// Verify specific expected values
	if tree.tasks != 12 { // Total tasks: 3 + 2 + 4 + 2 + 1 = 12
		t.Errorf("Expected root to have 12 tasks, got %d", tree.tasks)
	}

	dc1Tasks := tree.next["dc1"].tasks
	if dc1Tasks != 5 { // dc1 tasks: 3 + 2 = 5
		t.Errorf("Expected dc1 to have 5 tasks, got %d", dc1Tasks)
	}
	dc1r1Tasks := tree.next["dc1"].next["r1"].tasks
	if dc1r1Tasks != 3 {
		t.Errorf("Expected dc1 r1 to have 3 tasks, got %d", dc1r1Tasks)
	}
	dc1r2Tasks := tree.next["dc1"].next["r2"].tasks
	if dc1r2Tasks != 2 {
		t.Errorf("Expected dc1 r2 to have 2 tasks, got %d", dc1r2Tasks)
	}

	dc2Tasks := tree.next["dc2"].tasks
	if dc2Tasks != 4 { // dc2 tasks: 4
		t.Errorf("Expected dc2 to have 4 tasks, got %d", dc2Tasks)
	}
	dc2r2Tasks := tree.next["dc2"].next["r2"].tasks
	if dc2r2Tasks != 4 {
		t.Errorf("Expected dc2 r2 to have 4 tasks, got %d", dc2r2Tasks)
	}

	otherTasks := tree.next[""].tasks
	if otherTasks != 3 {
		t.Errorf("Expected others to have 3 tasks, got %d", otherTasks)
	}
	subOtherTasks := tree.next[""].next[""].tasks
	if subOtherTasks != 3 {
		t.Errorf("Expected sub-others to have 3 tasks, got %d", subOtherTasks)
	}
}
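For orientation, the fixture above should yield a decision tree shaped like the following (worked out from the node labels and per-node task counts in the test; nodes without a datacenter or rack label are grouped under the empty-string branch):

root (12 tasks)
├── dc1 (5 tasks)
│   ├── r1 (3 tasks, node1)
│   └── r2 (2 tasks, node2)
├── dc2 (4 tasks)
│   └── r2 (4 tasks, node3)
└── "" (3 tasks)
    └── "" (3 tasks, node4 and node5)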

manager/scheduler/scheduler.go

Lines changed: 4 additions & 1 deletion
@@ -787,9 +787,11 @@ func (s *Scheduler) scheduleNTasksOnSubtree(ctx context.Context, n int, taskGrou

	// Try to make branches even until either all branches are
	// full, or all tasks have been scheduled.
-	for tasksScheduled != n && len(noRoom) != len(tree.next) {
+	converging := true
+	for tasksScheduled != n && len(noRoom) != len(tree.next) && converging {
		desiredTasksPerBranch := (tasksInUsableBranches + n - tasksScheduled) / (len(tree.next) - len(noRoom))
		remainder := (tasksInUsableBranches + n - tasksScheduled) % (len(tree.next) - len(noRoom))
+		converging = false

		for _, subtree := range tree.next {
			if noRoom != nil {
@@ -799,6 +801,7 @@ func (s *Scheduler) scheduleNTasksOnSubtree(ctx context.Context, n int, taskGrou
			}
			subtreeTasks := subtree.tasks
			if subtreeTasks < desiredTasksPerBranch || (subtreeTasks == desiredTasksPerBranch && remainder > 0) {
+				converging = true
				tasksToAssign := desiredTasksPerBranch - subtreeTasks
				if remainder > 0 {
					tasksToAssign++

manager/scheduler/scheduler_test.go

Lines changed: 151 additions & 0 deletions
@@ -1110,6 +1110,156 @@ func TestMultiplePreferences(t *testing.T) {
	t.Run("useSpecVersion=true", func(t *testing.T) { testMultiplePreferences(t, true) })
}

// TestMultiplePreferencesScaleUp is a regression test for an infinite loop
// bug in the scheduler.
func TestMultiplePreferencesScaleUp(t *testing.T) {
	ctx := context.Background()
	initialNodeSet := []*api.Node{
		{
			ID: "id11",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Labels: map[string]string{
						"az":   "dc1",
						"rack": "r1",
					},
				},
			},
		},
		{
			ID: "id12",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Labels: map[string]string{
						"az":   "dc1",
						"rack": "r2",
					},
				},
			},
		},
		{
			ID: "id21",
			Status: api.NodeStatus{
				State: api.NodeStatus_READY,
			},
			Spec: api.NodeSpec{
				Annotations: api.Annotations{
					Labels: map[string]string{
						"az":   "dc2",
						"rack": "r1",
					},
				},
			},
		},
	}

	taskTemplate1 := &api.Task{
		DesiredState: api.TaskStateRunning,
		ServiceID:    "service1",
		// The service needs to have a spec version to be scheduled as a
		// group, a necessary precondition for the scheduler
		// infinite-loop bug.
		SpecVersion: &api.Version{Index: 1},
		Spec: api.TaskSpec{
			Runtime: &api.TaskSpec_Container{
				Container: &api.ContainerSpec{
					Image: "v:1",
				},
			},
			Placement: &api.Placement{
				Preferences: []*api.PlacementPreference{
					{
						Preference: &api.PlacementPreference_Spread{
							Spread: &api.SpreadOver{
								SpreadDescriptor: "node.labels.az",
							},
						},
					},
					{
						Preference: &api.PlacementPreference_Spread{
							Spread: &api.SpreadOver{
								SpreadDescriptor: "node.labels.rack",
							},
						},
					},
				},
			},
		},
		Status: api.TaskStatus{
			State: api.TaskStatePending,
		},
	}

	s := store.NewMemoryStore(nil)
	assert.NotNil(t, s)
	defer s.Close()

	t1Instances := 2

	err := s.Update(func(tx store.Tx) error {
		// Prepopulate nodes
		for _, n := range initialNodeSet {
			assert.NoError(t, store.CreateNode(tx, n))
		}

		// Prepopulate tasks from template 1
		for i := 0; i != t1Instances; i++ {
			taskTemplate1.ID = fmt.Sprintf("t1id%d", i)
			assert.NoError(t, store.CreateTask(tx, taskTemplate1))
		}

		// Populate some running tasks to simulate a service scaling scenario
		for node, tasks := range map[string]int{
			"id11": 3,
			"id12": 1,
			"id21": 3,
		} {
			for i := 0; i != tasks; i++ {
				taskTemplate1.ID = fmt.Sprintf("t1running-%s-%d", node, i)
				taskTemplate1.NodeID = node
				taskTemplate1.Status.State = api.TaskStateRunning
				assert.NoError(t, store.CreateTask(tx, taskTemplate1))
			}
		}
		return nil
	})
	assert.NoError(t, err)

	scheduler := New(s)

	watch, cancel := state.Watch(s.WatchQueue(), api.EventUpdateTask{})
	defer cancel()

	go func() {
		assert.NoError(t, scheduler.Run(ctx))
	}()
	defer scheduler.Stop()

	t1Assignments := make(map[string]int)
	totalAssignments := 0
	for i := 0; i != t1Instances; i++ {
		assignment := watchAssignment(t, watch)
		if !strings.HasPrefix(assignment.ID, "t1") {
			t.Fatal("got assignment for different kind of task")
		}
		t1Assignments[assignment.NodeID]++
		totalAssignments++
	}

	t.Logf("t1Assignments: %#v", t1Assignments)
	assert.Equal(t, t1Instances, totalAssignments)
	// It would be valid for the scheduler to either assign the tasks to
	// id12, which balances r1 and r2 of dc1, or to id21, which balances
	// dc1 and dc2.
	assert.Equal(t, 2, t1Assignments["id12"]+t1Assignments["id21"])
}

func TestSchedulerNoReadyNodes(t *testing.T) {
	ctx := context.Background()
	initialTask := &api.Task{
@@ -2698,6 +2848,7 @@ func watchAssignmentFailure(t *testing.T, watch chan events.Event) *api.Task {
}

func watchAssignment(t *testing.T, watch chan events.Event) *api.Task {
+	t.Helper()
	for {
		select {
		case event := <-watch: