Skip to content

Commit 6f45ad4

Browse files
authored
Log stuck lookups (#5778)
* Log stuck lookups every interval * Implement debug manually * Add comment * Do not print peers twice * Add SYNC_LOOKUPS_STUCK metric * Skip logging request root * use derivative * Merge branch 'unstable' of https://github.com/sigp/lighthouse into log-stuck-lookups * add req id to debug * Merge remote-tracking branch 'sigp/unstable' into log-stuck-lookups * Fix conflict with unstable
1 parent 683d9df commit 6f45ad4

File tree

4 files changed

+73
-3
lines changed

4 files changed

+73
-3
lines changed

beacon_node/network/src/metrics.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,10 @@ lazy_static! {
257257
"sync_lookups_completed_total",
258258
"Total count of sync lookups completed",
259259
);
260+
pub static ref SYNC_LOOKUPS_STUCK: Result<IntGauge> = try_create_int_gauge(
261+
"sync_lookups_stuck",
262+
"Current count of sync lookups that may be stuck",
263+
);
260264

261265
/*
262266
* Block Delay Metrics

beacon_node/network/src/sync/block_lookups/mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ mod tests;
3030

3131
const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60;
3232
pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4;
33+
const LOOKUP_MAX_DURATION_SECS: u64 = 60;
3334

3435
pub enum BlockComponent<E: EthSpec> {
3536
Block(DownloadResult<Arc<SignedBeaconBlock<E>>>),
@@ -665,4 +666,21 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
665666
self.single_block_lookups.len() as i64,
666667
);
667668
}
669+
670+
pub fn log_stuck_lookups(&self) {
671+
let mut stuck_count = 0;
672+
for lookup in self.single_block_lookups.values() {
673+
if lookup.elapsed_since_created() > Duration::from_secs(LOOKUP_MAX_DURATION_SECS) {
674+
debug!(self.log, "Lookup maybe stuck";
675+
// Fields id and block_root are also part of the summary. However, logging them
676+
// here allows log parsers o index them and have better search
677+
"id" => lookup.id,
678+
"block_root" => ?lookup.block_root(),
679+
"summary" => ?lookup,
680+
);
681+
stuck_count += 1;
682+
}
683+
}
684+
metrics::set_gauge(&metrics::SYNC_LOOKUPS_STUCK, stuck_count);
685+
}
668686
}

beacon_node/network/src/sync/block_lookups/single_block_lookup.rs

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@ use crate::sync::block_lookups::common::RequestState;
44
use crate::sync::block_lookups::Id;
55
use crate::sync::network_context::{LookupRequestResult, ReqId, SyncNetworkContext};
66
use beacon_chain::BeaconChainTypes;
7+
use derivative::Derivative;
78
use itertools::Itertools;
89
use rand::seq::IteratorRandom;
910
use std::collections::HashSet;
1011
use std::fmt::Debug;
1112
use std::sync::Arc;
12-
use std::time::Duration;
13+
use std::time::{Duration, Instant};
1314
use store::Hash256;
1415
use strum::IntoStaticStr;
1516
use types::blob_sidecar::FixedBlobSidecarList;
@@ -53,12 +54,15 @@ pub enum LookupRequestError {
5354
},
5455
}
5556

57+
#[derive(Derivative)]
58+
#[derivative(Debug(bound = "T: BeaconChainTypes"))]
5659
pub struct SingleBlockLookup<T: BeaconChainTypes> {
5760
pub id: Id,
5861
pub block_request_state: BlockRequestState<T::EthSpec>,
5962
pub blob_request_state: BlobRequestState<T::EthSpec>,
6063
block_root: Hash256,
6164
awaiting_parent: Option<Hash256>,
65+
created: Instant,
6266
}
6367

6468
impl<T: BeaconChainTypes> SingleBlockLookup<T> {
@@ -74,6 +78,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
7478
blob_request_state: BlobRequestState::new(requested_block_root, peers),
7579
block_root: requested_block_root,
7680
awaiting_parent,
81+
created: Instant::now(),
7782
}
7883
}
7984

@@ -98,6 +103,11 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
98103
self.awaiting_parent = None;
99104
}
100105

106+
/// Returns the time elapsed since this lookup was created
107+
pub fn elapsed_since_created(&self) -> Duration {
108+
self.created.elapsed()
109+
}
110+
101111
/// Maybe insert a verified response into this lookup. Returns true if imported
102112
pub fn add_child_components(&mut self, block_component: BlockComponent<T::EthSpec>) -> bool {
103113
match block_component {
@@ -244,7 +254,10 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
244254
}
245255

246256
/// The state of the blob request component of a `SingleBlockLookup`.
257+
#[derive(Derivative)]
258+
#[derivative(Debug)]
247259
pub struct BlobRequestState<E: EthSpec> {
260+
#[derivative(Debug = "ignore")]
248261
pub block_root: Hash256,
249262
pub state: SingleLookupRequestState<FixedBlobSidecarList<E>>,
250263
}
@@ -259,7 +272,10 @@ impl<E: EthSpec> BlobRequestState<E> {
259272
}
260273

261274
/// The state of the block request component of a `SingleBlockLookup`.
275+
#[derive(Derivative)]
276+
#[derivative(Debug)]
262277
pub struct BlockRequestState<E: EthSpec> {
278+
#[derivative(Debug = "ignore")]
263279
pub requested_block_root: Hash256,
264280
pub state: SingleLookupRequestState<Arc<SignedBeaconBlock<E>>>,
265281
}
@@ -281,7 +297,7 @@ pub struct DownloadResult<T: Clone> {
281297
pub peer_id: PeerId,
282298
}
283299

284-
#[derive(Debug, PartialEq, Eq, IntoStaticStr)]
300+
#[derive(PartialEq, Eq, IntoStaticStr)]
285301
pub enum State<T: Clone> {
286302
AwaitingDownload,
287303
Downloading(ReqId),
@@ -293,13 +309,16 @@ pub enum State<T: Clone> {
293309
}
294310

295311
/// Object representing the state of a single block or blob lookup request.
296-
#[derive(PartialEq, Eq, Debug)]
312+
#[derive(PartialEq, Eq, Derivative)]
313+
#[derivative(Debug)]
297314
pub struct SingleLookupRequestState<T: Clone> {
298315
/// State of this request.
299316
state: State<T>,
300317
/// Peers that should have this block or blob.
318+
#[derivative(Debug(format_with = "fmt_peer_set"))]
301319
available_peers: HashSet<PeerId>,
302320
/// Peers from which we have requested this block.
321+
#[derivative(Debug = "ignore")]
303322
used_peers: HashSet<PeerId>,
304323
/// How many times have we attempted to process this block or blob.
305324
failed_processing: u8,
@@ -529,8 +548,30 @@ impl<T: Clone> SingleLookupRequestState<T> {
529548
}
530549
}
531550

551+
// Display is used in the BadState assertions above
532552
impl<T: Clone> std::fmt::Display for State<T> {
533553
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
534554
write!(f, "{}", Into::<&'static str>::into(self))
535555
}
536556
}
557+
558+
// Debug is used in the log_stuck_lookups print to include some more info. Implements custom Debug
559+
// to not dump an entire block or blob to terminal which don't add valuable data.
560+
impl<T: Clone> std::fmt::Debug for State<T> {
561+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
562+
match self {
563+
Self::AwaitingDownload { .. } => write!(f, "AwaitingDownload"),
564+
Self::Downloading(req_id) => write!(f, "Downloading({:?})", req_id),
565+
Self::AwaitingProcess(d) => write!(f, "AwaitingProcess({:?})", d.peer_id),
566+
Self::Processing(d) => write!(f, "Processing({:?})", d.peer_id),
567+
Self::Processed { .. } => write!(f, "Processed"),
568+
}
569+
}
570+
}
571+
572+
fn fmt_peer_set(
573+
peer_set: &HashSet<PeerId>,
574+
f: &mut std::fmt::Formatter,
575+
) -> Result<(), std::fmt::Error> {
576+
write!(f, "{}", peer_set.len())
577+
}

beacon_node/network/src/sync/manager.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,10 @@ impl<T: BeaconChainTypes> SyncManager<T> {
547547
futures::stream::iter(ee_responsiveness_watch.await).flatten()
548548
};
549549

550+
// LOOKUP_MAX_DURATION_SECS is 60 seconds. Logging every 30 seconds allows enough timely
551+
// visbility while being sparse and not increasing the debug log volume in a noticeable way
552+
let mut interval = tokio::time::interval(Duration::from_secs(30));
553+
550554
// process any inbound messages
551555
loop {
552556
tokio::select! {
@@ -556,6 +560,9 @@ impl<T: BeaconChainTypes> SyncManager<T> {
556560
Some(engine_state) = check_ee_stream.next(), if check_ee => {
557561
self.handle_new_execution_engine_state(engine_state);
558562
}
563+
_ = interval.tick() => {
564+
self.block_lookups.log_stuck_lookups();
565+
}
559566
}
560567
}
561568
}

0 commit comments

Comments
 (0)