
Commit 587d35f

panic when trying to dump & repair a block that we produced
1 parent b4fe128 commit 587d35f
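
For orientation, below is a minimal, self-contained sketch of the guard this commit introduces inside ReplayStage::dump_then_repair_correct_slots (the full diff follows). The Pubkey and LeaderScheduleCache types here are simplified stand-ins for the real Solana types, check_not_dumping_own_block is a hypothetical helper, and slot_leader_at is given a simplified signature; in the actual change the check is inlined in the retain closure and calls leader_schedule_cache.slot_leader_at(*duplicate_slot, None).

// Minimal sketch; `Pubkey` and `LeaderScheduleCache` are simplified
// stand-ins for the real Solana types, not the production implementation.

use std::collections::HashMap;

type Slot = u64;

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Pubkey(u64);

struct LeaderScheduleCache {
    // slot -> leader pubkey; stand-in for the real epoch-schedule lookup
    leaders: HashMap<Slot, Pubkey>,
}

impl LeaderScheduleCache {
    fn slot_leader_at(&self, slot: Slot) -> Option<Pubkey> {
        self.leaders.get(&slot).copied()
    }
}

// Refuse to dump a slot we produced ourselves: that would mean we either
// produced duplicate blocks or computed a bank hash that disagrees with the
// rest of the cluster, so crash loudly instead of repairing over our own block.
fn check_not_dumping_own_block(
    my_pubkey: &Pubkey,
    leader_schedule_cache: &LeaderScheduleCache,
    duplicate_slot: Slot,
) {
    if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(duplicate_slot) {
        panic!("We are attempting to dump a block that we produced (slot {duplicate_slot})");
    }
}

fn main() {
    let me = Pubkey(1);
    let other = Pubkey(2);
    let cache = LeaderScheduleCache {
        leaders: HashMap::from([(1, other), (2, me)]),
    };

    // Slot 1 was produced by another leader, so dump-and-repair may proceed.
    check_not_dumping_own_block(&me, &cache, 1);

    // Slot 2 was produced by us; uncommenting this line panics, which is what
    // the new `test_dump_own_slots_fails` test asserts via `#[should_panic]`.
    // check_not_dumping_own_block(&me, &cache, 2);
}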

2 files changed (+89, -12 lines)

core/src/ancestor_hashes_service.rs

Lines changed: 4 additions & 0 deletions
@@ -1554,6 +1554,8 @@ mod test {
         let ReplayBlockstoreComponents {
             blockstore: requester_blockstore,
             vote_simulator,
+            my_pubkey,
+            leader_schedule_cache,
             ..
         } = setup_dead_slot(dead_slot, correct_bank_hashes);

@@ -1639,6 +1641,8 @@ mod test {
             None,
             &mut PurgeRepairSlotCounter::default(),
             &dumped_slots_sender,
+            &my_pubkey,
+            &leader_schedule_cache,
         );

         // Simulate making a request

core/src/replay_stage.rs

Lines changed: 85 additions & 12 deletions
@@ -919,6 +919,8 @@ impl ReplayStage {
                 poh_bank.map(|bank| bank.slot()),
                 &mut purge_repair_slot_counter,
                 &dumped_slots_sender,
+                &my_pubkey,
+                &leader_schedule_cache,
             );
             dump_then_repair_correct_slots_time.stop();

@@ -1162,6 +1164,7 @@ impl ReplayStage {
         (progress, heaviest_subtree_fork_choice)
     }

+    #[allow(clippy::too_many_arguments)]
     pub fn dump_then_repair_correct_slots(
         duplicate_slots_to_repair: &mut DuplicateSlotsToRepair,
         ancestors: &mut HashMap<Slot, HashSet<Slot>>,
@@ -1172,6 +1175,8 @@
         poh_bank_slot: Option<Slot>,
         purge_repair_slot_counter: &mut PurgeRepairSlotCounter,
         dumped_slots_sender: &DumpedSlotsSender,
+        my_pubkey: &Pubkey,
+        leader_schedule_cache: &LeaderScheduleCache,
     ) {
         if duplicate_slots_to_repair.is_empty() {
             return;
@@ -1180,10 +1185,10 @@
         let root_bank = bank_forks.read().unwrap().root_bank();
         let mut dumped = vec![];
         // TODO: handle if alternate version of descendant also got confirmed after ancestor was
-        // confirmed, what happens then? Should probably keep track of purged list and skip things
-        // in `duplicate_slots_to_repair` that have already been purged. Add test.
+        // confirmed, what happens then? Should probably keep track of dumped list and skip things
+        // in `duplicate_slots_to_repair` that have already been dumped. Add test.
         duplicate_slots_to_repair.retain(|duplicate_slot, correct_hash| {
-            // Should not purge duplicate slots if there is currently a poh bank building
+            // Should not dump duplicate slots if there is currently a poh bank building
             // on top of that slot, as BankingStage might still be referencing/touching that state
             // concurrently.
             // Luckily for us, because the fork choice rule removes duplicate slots from fork
@@ -1203,13 +1208,13 @@
                 })
                 .unwrap_or(false);

-            let did_purge_repair = {
+            let did_dump_repair = {
                 if !is_poh_building_on_duplicate_fork {
                     let frozen_hash = bank_forks.read().unwrap().bank_hash(*duplicate_slot);
                     if let Some(frozen_hash) = frozen_hash {
                         if frozen_hash == *correct_hash {
                             warn!(
-                                "Trying to purge slot {} with correct_hash {}",
+                                "Trying to dump slot {} with correct_hash {}",
                                 *duplicate_slot, *correct_hash
                             );
                             return false;
@@ -1219,19 +1224,29 @@
                         )
                         {
                             warn!(
-                                "Trying to purge unfrozen slot {} that is not dead",
+                                "Trying to dump unfrozen slot {} that is not dead",
                                 *duplicate_slot
                             );
                             return false;
                         }
                     } else {
                         warn!(
-                            "Trying to purge slot {} which does not exist in bank forks",
+                            "Trying to dump slot {} which does not exist in bank forks",
                             *duplicate_slot
                         );
                         return false;
                     }

+
+                    // Should not dump slots for which we were the leader
+                    if Some(*my_pubkey) == leader_schedule_cache.slot_leader_at(*duplicate_slot, None) {
+                        panic!("We are attempting to dump a block that we produced. \
+                            This indicates that we are producing duplicate blocks, \
+                            or that there is a bug in our runtime/replay code which \
+                            causes us to compute different bank hashes than the rest of the cluster. \
+                            We froze slot {duplicate_slot} with hash {frozen_hash:?} while the cluster hash is {correct_hash}");
+                    }
+
                     Self::purge_unconfirmed_duplicate_slot(
                         *duplicate_slot,
                         ancestors,
@@ -1249,7 +1264,11 @@
                     .and_modify(|x| *x += 1)
                     .or_insert(1);
                 if *attempt_no > MAX_REPAIR_RETRY_LOOP_ATTEMPTS {
-                    panic!("We have tried to repair duplicate slot: {} more than {} times and are unable to freeze a block with bankhash {}, instead we have a block with bankhash {:?}. This is most likely a bug in the runtime. At this point manual intervention is needed to make progress. Exiting", *duplicate_slot, MAX_REPAIR_RETRY_LOOP_ATTEMPTS, *correct_hash, frozen_hash);
+                    panic!("We have tried to repair duplicate slot: {duplicate_slot} more than {MAX_REPAIR_RETRY_LOOP_ATTEMPTS} times \
+                        and are unable to freeze a block with bankhash {correct_hash}, \
+                        instead we have a block with bankhash {frozen_hash:?}. \
+                        This is most likely a bug in the runtime. \
+                        At this point manual intervention is needed to make progress. Exiting");
                 }
                 warn!(
                     "Notifying repair service to repair duplicate slot: {}, attempt {}",
@@ -1266,8 +1285,8 @@
                 }
             };

-            // If we purged/repaired, then no need to keep the slot in the set of pending work
-            !did_purge_repair
+            // If we dumped/repaired, then no need to keep the slot in the set of pending work
+            !did_dump_repair
         });

         // Notify repair of the dumped slots along with the correct hash
@@ -3674,9 +3693,9 @@ pub(crate) mod tests {
     pub struct ReplayBlockstoreComponents {
         pub blockstore: Arc<Blockstore>,
         validator_node_to_vote_keys: HashMap<Pubkey, Pubkey>,
-        my_pubkey: Pubkey,
+        pub(crate) my_pubkey: Pubkey,
         cluster_info: ClusterInfo,
-        leader_schedule_cache: Arc<LeaderScheduleCache>,
+        pub(crate) leader_schedule_cache: Arc<LeaderScheduleCache>,
         poh_recorder: RwLock<PohRecorder>,
         tower: Tower,
         rpc_subscriptions: Arc<RpcSubscriptions>,
@@ -6037,6 +6056,7 @@ pub(crate) mod tests {
         let ReplayBlockstoreComponents {
             ref mut vote_simulator,
             ref blockstore,
+            ref leader_schedule_cache,
             ..
         } = replay_blockstore_components(Some(forks), 1, None);

@@ -6073,6 +6093,8 @@ pub(crate) mod tests {
             None,
             &mut purge_repair_slot_counter,
             &dumped_slots_sender,
+            &Pubkey::new_unique(),
+            leader_schedule_cache,
         );
         assert_eq!(should_be_dumped, dumped_slots_receiver.recv().ok().unwrap());

@@ -6128,6 +6150,7 @@ pub(crate) mod tests {
             ref mut tower,
             ref blockstore,
             ref mut vote_simulator,
+            ref leader_schedule_cache,
             ..
         } = replay_components;

@@ -6192,6 +6215,8 @@ pub(crate) mod tests {
             None,
             &mut PurgeRepairSlotCounter::default(),
             &dumped_slots_sender,
+            &Pubkey::new_unique(),
+            leader_schedule_cache,
         );

         // Check everything was purged properly
@@ -7031,6 +7056,54 @@ pub(crate) mod tests {
         assert_eq!(received_slots, vec![8, 9, 11]);
     }

+    #[test]
+    #[should_panic(expected = "We are attempting to dump a block that we produced")]
+    fn test_dump_own_slots_fails() {
+        // Create the tree of banks in a BankForks object
+        let forks = tr(0) / (tr(1)) / (tr(2));
+
+        let ReplayBlockstoreComponents {
+            ref mut vote_simulator,
+            ref blockstore,
+            ref my_pubkey,
+            ref leader_schedule_cache,
+            ..
+        } = replay_blockstore_components(Some(forks), 1, None);
+
+        let VoteSimulator {
+            ref mut progress,
+            ref bank_forks,
+            ..
+        } = vote_simulator;
+
+        let (mut ancestors, mut descendants) = {
+            let r_bank_forks = bank_forks.read().unwrap();
+            (r_bank_forks.ancestors(), r_bank_forks.descendants())
+        };
+
+        // Insert different versions of both 1 and 2. Although normally these slots would be dumped,
+        // because we were the leader for these slots we should panic
+        let mut duplicate_slots_to_repair = DuplicateSlotsToRepair::default();
+        duplicate_slots_to_repair.insert(1, Hash::new_unique());
+        duplicate_slots_to_repair.insert(2, Hash::new_unique());
+        let mut purge_repair_slot_counter = PurgeRepairSlotCounter::default();
+        let (dumped_slots_sender, _) = unbounded();
+
+        ReplayStage::dump_then_repair_correct_slots(
+            &mut duplicate_slots_to_repair,
+            &mut ancestors,
+            &mut descendants,
+            progress,
+            bank_forks,
+            blockstore,
+            None,
+            &mut purge_repair_slot_counter,
+            &dumped_slots_sender,
+            my_pubkey,
+            leader_schedule_cache,
+        );
+    }
+
     fn run_compute_and_select_forks(
         bank_forks: &RwLock<BankForks>,
         progress: &mut ProgressMap,
