Skip to content

Commit c9ac86d

Browse files
josephhz and sfrothwell
authored and committed
ocfs2: fix deadlock between o2hb thread and o2net_wq
The following case may lead to a deadlock between o2net_wq and the o2hb thread on o2hb_callback_sem.

Currently there are 2 nodes, say N1 and N2, in the cluster. N2 goes down and, at the same time, N3 tries to join the cluster, so N1 handles the node down (N2) and the join (N3) simultaneously.

o2hb                                    o2net_wq
->o2hb_do_disk_heartbeat
->o2hb_check_slot
->o2hb_run_event_list
->o2hb_fire_callbacks
->down_write(&o2hb_callback_sem)
->o2net_hb_node_down_cb
->flush_workqueue(o2net_wq)
                                        ->o2net_process_message
                                        ->dlm_query_join_handler
                                        ->o2hb_check_node_heartbeating
                                        ->o2hb_fill_node_map
                                        ->down_read(&o2hb_callback_sem)

The o2hb thread holds o2hb_callback_sem for write while flush_workqueue() waits for the o2net_wq work item, and that work item is itself blocked in down_read() on the same semaphore, so neither side can make progress.

There is no need to take o2hb_callback_sem in dlm_query_join_handler; o2hb_live_lock is enough to protect the live node map.

Signed-off-by: Joseph Qi <[email protected]>
Cc: xMark Fasheh <[email protected]>
Cc: Joel Becker <[email protected]>
Cc: jiangyiwen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 17a535c commit c9ac86d

File tree

3 files changed

+21
-1
lines changed

3 files changed

+21
-1
lines changed

fs/ocfs2/cluster/heartbeat.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
25722572
}
25732573
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
25742574

2575+
/*
 * o2hb_check_node_heartbeating_no_sem() - test whether @node_num is set in
 * the heartbeat node map without taking o2hb_callback_sem.
 *
 * The map is filled into a local bitmap while holding only the
 * o2hb_live_lock spinlock, and the bit for @node_num is tested after the
 * lock has been dropped. Per the commit message, o2hb_live_lock is
 * sufficient to protect the live node map, and avoiding o2hb_callback_sem
 * is what lets dlm_query_join_handler run on o2net_wq while the o2hb thread
 * holds that semaphore for write and flushes the workqueue.
 *
 * Returns 1 if the bit for @node_num is set in the copied map, 0 otherwise
 * (an ML_HEARTBEAT message is logged in the 0 case).
 */
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2576+
{
2577+
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2578+
unsigned long flags;
2579+
2580+
/* Only o2hb_live_lock is held here; o2hb_callback_sem is deliberately not taken. */
spin_lock_irqsave(&o2hb_live_lock, flags);
2581+
/* NOTE(review): presumably copies the global live-node bitmap into testing_map -- confirm against the helper. */
o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2582+
spin_unlock_irqrestore(&o2hb_live_lock, flags);
2583+
/* The test runs on the private copy, so no lock is needed from here on. */
if (!test_bit(node_num, testing_map)) {
2584+
mlog(ML_HEARTBEAT,
2585+
"node (%u) does not have heartbeating enabled.\n",
2586+
node_num);
2587+
return 0;
2588+
}
2589+
2590+
return 1;
2591+
}
2592+
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
2593+
25752594
int o2hb_check_node_heartbeating_from_callback(u8 node_num)
25762595
{
25772596
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];

fs/ocfs2/cluster/heartbeat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
8080
void o2hb_exit(void);
8181
int o2hb_init(void);
8282
int o2hb_check_node_heartbeating(u8 node_num);
83+
/*
 * Variant of o2hb_check_node_heartbeating() that does not take
 * o2hb_callback_sem; it relies on o2hb_live_lock alone. Added so that
 * dlm_query_join_handler, running on o2net_wq, cannot block on the
 * semaphore while the o2hb thread holds it and flushes o2net_wq.
 */
int o2hb_check_node_heartbeating_no_sem(u8 node_num);
8384
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
8485
int o2hb_check_local_node_heartbeating(void);
8586
void o2hb_stop_all_regions(void);

fs/ocfs2/dlm/dlmdomain.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
839839
* to back off and try again. This gives heartbeat a chance
840840
* to catch up.
841841
*/
842-
if (!o2hb_check_node_heartbeating(query->node_idx)) {
842+
if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
843843
mlog(0, "node %u is not in our live map yet\n",
844844
query->node_idx);
845845

0 commit comments

Comments
 (0)