Skip to content

Commit 58630e1

Browse files
committed
Merge pull request #55794 from rails/fix-55513
[Fix #55513] parallel tests hanging when worker processes die abruptly
1 parent 8ee0169 commit 58630e1

File tree

5 files changed

+58
-5
lines changed

5 files changed

+58
-5
lines changed

activesupport/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
* Fix parallel tests hanging when worker processes die abruptly.
2+
3+
Previously, if a worker process was killed (e.g., OOM killed, `kill -9`) during parallel
4+
test execution, the test suite would hang forever waiting for the dead worker.
5+
6+
*Joshua Young*
7+
18
* `ActiveSupport::FileUpdateChecker` does not depend on `Time.now` to prevent unnecessary reloads with time travel test helpers
29

310
*Jan Grodowski*

activesupport/lib/active_support/testing/parallelization.rb

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,19 @@ def size
4747
end
4848

4949
def shutdown
50+
dead_worker_pids = @worker_pool.filter_map do |pid|
51+
Process.waitpid(pid, Process::WNOHANG)
52+
rescue Errno::ECHILD
53+
pid
54+
end
55+
@queue_server.remove_dead_workers(dead_worker_pids)
56+
5057
@queue_server.shutdown
51-
@worker_pool.each { |pid| Process.waitpid pid }
58+
@worker_pool.each do |pid|
59+
Process.waitpid(pid)
60+
rescue Errno::ECHILD
61+
nil
62+
end
5263
end
5364
end
5465
end

activesupport/lib/active_support/testing/parallelization/server.rb

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class Server
1414
def initialize
1515
@queue = Queue.new
1616
@active_workers = Concurrent::Map.new
17+
@worker_pids = Concurrent::Map.new
1718
@in_flight = Concurrent::Map.new
1819
end
1920

@@ -40,12 +41,24 @@ def pop
4041
end
4142
end
4243

43-
def start_worker(worker_id)
44+
def start_worker(worker_id, worker_pid)
4445
@active_workers[worker_id] = true
46+
@worker_pids[worker_id] = worker_pid
4547
end
4648

47-
def stop_worker(worker_id)
49+
def stop_worker(worker_id, worker_pid)
4850
@active_workers.delete(worker_id)
51+
@worker_pids.delete(worker_id)
52+
end
53+
54+
def remove_dead_workers(dead_pids)
55+
dead_pids.each do |dead_pid|
56+
worker_id = @worker_pids.key(dead_pid)
57+
if worker_id
58+
@active_workers.delete(worker_id)
59+
@worker_pids.delete(worker_id)
60+
end
61+
end
4962
end
5063

5164
def active_workers?

activesupport/lib/active_support/testing/parallelization/worker.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def start
1818
DRb.stop_service
1919

2020
@queue = DRbObject.new_with_uri(@url)
21-
@queue.start_worker(@id)
21+
@queue.start_worker(@id, Process.pid)
2222

2323
begin
2424
after_fork
@@ -29,7 +29,7 @@ def start
2929
set_process_title("(stopping)")
3030

3131
run_cleanup
32-
@queue.stop_worker(@id)
32+
@queue.stop_worker(@id, Process.pid)
3333
end
3434
end
3535

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "abstract_unit"
4+
5+
class ParallelizationTest < ActiveSupport::TestCase
6+
test "shutdown handles dead workers gracefully" do
7+
parallelization = ActiveSupport::Testing::Parallelization.new(1)
8+
parallelization.start
9+
10+
sleep 0.25
11+
12+
server = parallelization.instance_variable_get(:@queue_server)
13+
assert server.active_workers?
14+
15+
worker_pids = parallelization.instance_variable_get(:@worker_pool)
16+
Process.kill("KILL", worker_pids.first)
17+
sleep 0.25
18+
19+
Timeout.timeout(2.5, Minitest::Assertion, "Expected shutdown to not hang") { parallelization.shutdown }
20+
assert_not server.active_workers?
21+
end
22+
end

0 commit comments

Comments
 (0)