Skip to content

Commit 9151436

Browse files
Fix test OOM issues on tree-states-hot (#7176)
* Try rebasing * Heaptrack * Try to prevent and detect circular references * Don't use StateRootsIterator in get_ancestor_state_root * Skip one slot in get_ancestor_state_root * Heaptrack clippy fix * Fix store test * Log diff base state root * Jump by slots per historical vector * Maybe ok fix for test * get_ancestor_state_root * lint * Implement downgrade from v24 * Better on state write * Log on ws test chain dump * Fix WSS tests * Tweak condition * Revert "Heaptrack" This reverts commit f69c38c. * Reduce diff * Use debug repr * Spelling --------- Co-authored-by: dapplion <[email protected]>
1 parent b49272f commit 9151436

File tree

12 files changed

+409
-93
lines changed

12 files changed

+409
-93
lines changed

beacon_node/beacon_chain/src/builder.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,8 @@ where
380380
self.pending_io_batch.push(
381381
store
382382
.init_anchor_info(
383-
genesis_beacon_block.message().parent_root(),
383+
genesis_beacon_block.parent_root(),
384+
genesis_beacon_block.slot(),
384385
Slot::new(0),
385386
retain_historic_states,
386387
)
@@ -532,14 +533,21 @@ where
532533
.cold_db
533534
.do_atomically(block_root_batch)
534535
.map_err(|e| format!("Error writing frozen block roots: {e:?}"))?;
536+
debug!(
537+
from = %weak_subj_block.slot(),
538+
to_excl = %weak_subj_state.slot(),
539+
block_root = ?weak_subj_block_root,
540+
"Stored frozen block roots at skipped slots"
541+
);
535542

536543
// Write the anchor to memory before calling `put_state` otherwise hot hdiff can't store
537544
// states that do not align with the start_slot grid
538545
let retain_historic_states = self.chain_config.reconstruct_historic_states;
539546
self.pending_io_batch.push(
540547
store
541548
.init_anchor_info(
542-
weak_subj_block.message().parent_root(),
549+
weak_subj_block.parent_root(),
550+
weak_subj_block.slot(),
543551
weak_subj_slot,
544552
retain_historic_states,
545553
)

beacon_node/beacon_chain/src/historical_blocks.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
166166

167167
// Store block roots, including at all skip slots in the freezer DB.
168168
for slot in (block.slot().as_u64()..prev_block_slot.as_u64()).rev() {
169+
debug!(%slot, ?block_root, "Storing frozen block to root mapping");
169170
cold_batch.push(KeyValueStoreOp::PutKeyValue(
170171
DBColumn::BeaconBlockRoots,
171172
slot.to_be_bytes().to_vec(),

beacon_node/beacon_chain/src/migrate.rs

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -471,17 +471,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
471471
let state_summaries = store
472472
.load_hot_state_summaries()?
473473
.into_iter()
474-
.map(|(state_root, summary)| {
475-
(
476-
state_root,
477-
DAGStateSummary {
478-
slot: summary.slot,
479-
latest_block_root: summary.latest_block_root,
480-
latest_block_slot: summary.latest_block_slot,
481-
previous_state_root: summary.previous_state_root,
482-
},
483-
)
484-
})
474+
.map(|(state_root, summary)| (state_root, summary.into()))
485475
.collect::<Vec<(Hash256, DAGStateSummary)>>();
486476

487477
// Sanity check, there is at least one summary with the new finalized block root

beacon_node/beacon_chain/src/schema_change/migration_schema_v24.rs

Lines changed: 112 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
use crate::{
22
beacon_chain::BeaconChainTypes,
3-
summaries_dag::{DAGStateSummaryV22, StateSummariesDAG},
3+
summaries_dag::{DAGStateSummary, DAGStateSummaryV22, StateSummariesDAG},
44
};
5-
use ssz::Decode;
5+
use ssz::{Decode, Encode};
66
use ssz_derive::{Decode, Encode};
77
use std::{
88
sync::Arc,
99
time::{Duration, Instant},
1010
};
1111
use store::{
12-
get_full_state_v22, hdiff::StorageStrategy, hot_cold_store::DiffBaseStateRoot, DBColumn, Error,
13-
HotColdDB, HotStateSummary, KeyValueStore, KeyValueStoreOp, StoreItem,
12+
get_full_state_v22, hdiff::StorageStrategy, hot_cold_store::DiffBaseStateRoot,
13+
store_full_state_v22, DBColumn, Error, HotColdDB, HotStateSummary, KeyValueStore,
14+
KeyValueStoreOp, StoreItem,
1415
};
1516
use tracing::{debug, info, warn};
1617
use types::{EthSpec, Hash256, Slot};
@@ -94,8 +95,8 @@ pub fn upgrade_to_v24<T: BeaconChainTypes>(
9495
slots_count = summaries_by_slot.len(),
9596
min_slot = ?summaries_by_slot.first_key_value().map(|(slot, _)| slot),
9697
max_slot = ?summaries_by_slot.last_key_value().map(|(slot, _)| slot),
97-
state_summaries_dag_roots = ?state_summaries_dag_roots,
98-
hot_hdiff_start_slot = %hot_hdiff_start_slot,
98+
?state_summaries_dag_roots,
99+
%hot_hdiff_start_slot,
99100
split_state_root = ?split.state_root,
100101
"Starting hot states migration"
101102
);
@@ -178,21 +179,22 @@ pub fn upgrade_to_v24<T: BeaconChainTypes>(
178179
})?
179180
};
180181

181-
let diff_base_state_root =
182-
if let Some(diff_base_slot) = storage_strategy.diff_base_slot() {
183-
DiffBaseStateRoot::new(
182+
let diff_base_state_root = if let Some(diff_base_slot) =
183+
storage_strategy.diff_base_slot()
184+
{
185+
DiffBaseStateRoot::new(
184186
diff_base_slot,
185187
state_summaries_dag
186188
.ancestor_state_root_at_slot(state_root, diff_base_slot)
187189
.map_err(|e| {
188190
Error::MigrationError(format!(
189-
"error computing ancestor_state_root_at_slot {e:?}"
191+
"error computing ancestor_state_root_at_slot({state_root:?}, {diff_base_slot}) {e:?}"
190192
))
191193
})?,
192194
)
193-
} else {
194-
DiffBaseStateRoot::zero()
195-
};
195+
} else {
196+
DiffBaseStateRoot::zero()
197+
};
196198

197199
let new_summary = HotStateSummary {
198200
slot,
@@ -227,8 +229,8 @@ pub fn upgrade_to_v24<T: BeaconChainTypes>(
227229
if last_log_time.elapsed() > Duration::from_secs(5) {
228230
last_log_time = Instant::now();
229231
info!(
230-
diffs_written = diffs_written,
231-
summaries_written = summaries_written,
232+
diffs_written,
233+
summaries_written,
232234
summaries_count = state_summaries_dag.summaries_count(),
233235
"Hot states migration in progress"
234236
);
@@ -239,8 +241,8 @@ pub fn upgrade_to_v24<T: BeaconChainTypes>(
239241
// TODO(hdiff): Should run hot DB compaction after deleting potentially a lot of states. Or should wait
240242
// for the next finality event?
241243
info!(
242-
diffs_written = diffs_written,
243-
summaries_written = summaries_written,
244+
diffs_written,
245+
summaries_written,
244246
summaries_count = state_summaries_dag.summaries_count(),
245247
"Hot states migration complete"
246248
);
@@ -249,10 +251,100 @@ pub fn upgrade_to_v24<T: BeaconChainTypes>(
249251
}
250252

251253
pub fn downgrade_from_v24<T: BeaconChainTypes>(
252-
_db: Arc<HotColdDB<T::EthSpec, T::HotStore, T::ColdStore>>,
254+
db: Arc<HotColdDB<T::EthSpec, T::HotStore, T::ColdStore>>,
253255
) -> Result<Vec<KeyValueStoreOp>, Error> {
254-
// TODO(hdiff): proper error
255-
panic!("downgrade not supported");
256+
let state_summaries = db
257+
.load_hot_state_summaries()?
258+
.into_iter()
259+
.map(|(state_root, summary)| (state_root, summary.into()))
260+
.collect::<Vec<(Hash256, DAGStateSummary)>>();
261+
262+
info!(
263+
summaries_count = state_summaries.len(),
264+
"DB downgrade of v24 state summaries started"
265+
);
266+
267+
let state_summaries_dag = StateSummariesDAG::new(state_summaries)
268+
.map_err(|e| Error::MigrationError(format!("Error on new StateSumariesDAG {e:?}")))?;
269+
270+
let mut migrate_ops = vec![];
271+
let mut states_written = 0;
272+
let mut summaries_written = 0;
273+
let mut last_log_time = Instant::now();
274+
275+
// TODO(tree-states): What about the anchor_slot? Is it safe to run the prior version of
276+
// Lighthouse with an a higher anchor_slot than expected?
277+
278+
for (state_root, summary) in state_summaries_dag
279+
.summaries_by_slot_ascending()
280+
.into_iter()
281+
.flat_map(|(_, summaries)| summaries)
282+
{
283+
// If boundary state persist
284+
if summary.slot % T::EthSpec::slots_per_epoch() == 0 {
285+
let (state, _) = db
286+
.load_hot_state(&state_root)?
287+
.ok_or(Error::MissingState(state_root))?;
288+
289+
// Immediately commit the state. Otherwise we will OOM and it's stored in a different
290+
// column. So if the migration crashes we just get extra harmless junk in the DB.
291+
let mut state_write_ops = vec![];
292+
store_full_state_v22(&state_root, &state, &mut state_write_ops)?;
293+
db.hot_db.do_atomically(state_write_ops)?;
294+
states_written += 1;
295+
}
296+
297+
// Persist old summary
298+
let epoch_boundary_state_slot = summary.slot - summary.slot % T::EthSpec::slots_per_epoch();
299+
let old_summary = HotStateSummaryV22 {
300+
slot: summary.slot,
301+
latest_block_root: summary.latest_block_root,
302+
epoch_boundary_state_root: state_summaries_dag
303+
.ancestor_state_root_at_slot(state_root, epoch_boundary_state_slot)
304+
.map_err(|e| {
305+
Error::MigrationError(format!(
306+
"error computing ancestor_state_root_at_slot({state_root:?}, {epoch_boundary_state_slot}) {e:?}"
307+
))
308+
})?,
309+
};
310+
migrate_ops.push(KeyValueStoreOp::PutKeyValue(
311+
DBColumn::BeaconStateSummary,
312+
state_root.as_slice().to_vec(),
313+
old_summary.as_ssz_bytes(),
314+
));
315+
summaries_written += 1;
316+
317+
// Delete existing data
318+
for db_column in [
319+
DBColumn::BeaconStateHotSummary,
320+
DBColumn::BeaconStateHotDiff,
321+
DBColumn::BeaconStateHotSnapshot,
322+
] {
323+
migrate_ops.push(KeyValueStoreOp::DeleteKey(
324+
db_column,
325+
state_root.as_slice().to_vec(),
326+
));
327+
}
328+
329+
if last_log_time.elapsed() > Duration::from_secs(5) {
330+
last_log_time = Instant::now();
331+
info!(
332+
states_written,
333+
summaries_written,
334+
summaries_count = state_summaries_dag.summaries_count(),
335+
"DB downgrade of v24 state summaries in progress"
336+
);
337+
}
338+
}
339+
340+
info!(
341+
states_written,
342+
summaries_written,
343+
summaries_count = state_summaries_dag.summaries_count(),
344+
"DB downgrade of v24 state summaries completed"
345+
);
346+
347+
Ok(migrate_ops)
256348
}
257349

258350
fn new_dag<T: BeaconChainTypes>(

beacon_node/beacon_chain/src/summaries_dag.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use std::{
33
cmp::Ordering,
44
collections::{btree_map::Entry, BTreeMap, HashMap},
55
};
6+
use store::HotStateSummary;
67
use types::{Hash256, Slot};
78

89
#[derive(Debug, Clone, Copy)]
@@ -64,6 +65,12 @@ pub enum Error {
6465
root_state_root: Hash256,
6566
root_state_slot: Slot,
6667
},
68+
CircularAncestorChain {
69+
state_root: Hash256,
70+
previous_state_root: Hash256,
71+
slot: Slot,
72+
last_slot: Slot,
73+
},
6774
}
6875

6976
impl StateSummariesDAG {
@@ -335,10 +342,24 @@ impl StateSummariesDAG {
335342
}
336343

337344
let mut ancestors = vec![];
345+
let mut last_slot = None;
338346
loop {
339347
if let Some(summary) = self.state_summaries_by_state_root.get(&state_root) {
348+
// Detect cycles, including the case where `previous_state_root == state_root`.
349+
if let Some(last_slot) = last_slot {
350+
if summary.slot >= last_slot {
351+
return Err(Error::CircularAncestorChain {
352+
state_root,
353+
previous_state_root: summary.previous_state_root,
354+
slot: summary.slot,
355+
last_slot,
356+
});
357+
}
358+
}
359+
340360
ancestors.push((state_root, summary.slot));
341-
state_root = summary.previous_state_root
361+
last_slot = Some(summary.slot);
362+
state_root = summary.previous_state_root;
342363
} else {
343364
return Ok(ancestors);
344365
}
@@ -360,6 +381,17 @@ impl StateSummariesDAG {
360381
}
361382
}
362383

384+
impl From<HotStateSummary> for DAGStateSummary {
385+
fn from(value: HotStateSummary) -> Self {
386+
Self {
387+
slot: value.slot,
388+
latest_block_root: value.latest_block_root,
389+
latest_block_slot: value.latest_block_slot,
390+
previous_state_root: value.previous_state_root,
391+
}
392+
}
393+
}
394+
363395
#[cfg(test)]
364396
mod tests {
365397
use super::{DAGStateSummaryV22, Error, StateSummariesDAG};

0 commit comments

Comments
 (0)