Skip to content

Commit 13d1234

Browse files
jxspawanjay176
authored andcommitted
improve libp2p connected peer metrics (sigp#5314)
* patch rust-yamux dep * improve libp2p connected peer metrics
1 parent 1009eda commit 13d1234

File tree

8 files changed

+64
-113
lines changed

8 files changed

+64
-113
lines changed

Cargo.lock

Lines changed: 1 addition & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ validator_client = { path = "validator_client" }
231231
validator_dir = { path = "common/validator_dir" }
232232
warp_utils = { path = "common/warp_utils" }
233233

234+
[patch.crates-io]
235+
yamux = { git = "https://github.com/sigp/rust-yamux.git" }
236+
234237
[profile.maxperf]
235238
inherits = "release"
236239
lto = "fat"

beacon_node/http_api/src/lib.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ use std::path::PathBuf;
6868
use std::pin::Pin;
6969
use std::sync::Arc;
7070
use sysinfo::{System, SystemExt};
71-
use system_health::observe_system_health_bn;
71+
use system_health::{observe_nat, observe_system_health_bn};
7272
use task_spawner::{Priority, TaskSpawner};
7373
use tokio::sync::{
7474
mpsc::{Sender, UnboundedSender},
@@ -3966,13 +3966,7 @@ pub fn serve<T: BeaconChainTypes>(
39663966
.and(warp::path::end())
39673967
.then(|task_spawner: TaskSpawner<T::EthSpec>| {
39683968
task_spawner.blocking_json_task(Priority::P1, move || {
3969-
Ok(api_types::GenericResponse::from(
3970-
lighthouse_network::metrics::NAT_OPEN
3971-
.as_ref()
3972-
.map(|v| v.get())
3973-
.unwrap_or(0)
3974-
!= 0,
3975-
))
3969+
Ok(api_types::GenericResponse::from(observe_nat()))
39763970
})
39773971
});
39783972

beacon_node/lighthouse_network/src/discovery/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1004,7 +1004,10 @@ impl<TSpec: EthSpec> NetworkBehaviour for Discovery<TSpec> {
10041004
discv5::Event::SocketUpdated(socket_addr) => {
10051005
info!(self.log, "Address updated"; "ip" => %socket_addr.ip(), "udp_port" => %socket_addr.port());
10061006
metrics::inc_counter(&metrics::ADDRESS_UPDATE_COUNT);
1007-
metrics::check_nat();
1007+
// We have SOCKET_UPDATED messages. This occurs when discovery has a majority of
1008+
// users reporting an external port and our ENR gets updated.
1009+
// Which means we are able to do NAT traversal.
1010+
metrics::set_gauge_vec(&metrics::NAT_OPEN, &["discv5"], 1);
10081011
// Discv5 will have updated our local ENR. We save the updated version
10091012
// to disk.
10101013

beacon_node/lighthouse_network/src/metrics.rs

Lines changed: 12 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
pub use lighthouse_metrics::*;
22

33
lazy_static! {
4-
pub static ref NAT_OPEN: Result<IntCounter> = try_create_int_counter(
4+
pub static ref NAT_OPEN: Result<IntGaugeVec> = try_create_int_gauge_vec(
55
"nat_open",
6-
"An estimate indicating if the local node is exposed to the internet."
6+
"An estimate indicating if the local node is reachable from external nodes",
7+
&["protocol"]
78
);
89
pub static ref ADDRESS_UPDATE_COUNT: Result<IntCounter> = try_create_int_counter(
910
"libp2p_address_update_total",
@@ -14,6 +15,9 @@ lazy_static! {
1415
"Count of libp2p peers currently connected"
1516
);
1617

18+
pub static ref PEERS_CONNECTED_MULTI: Result<IntGaugeVec> =
19+
try_create_int_gauge_vec("libp2p_peers_multi", "Count of libp2p peers currently connected", &["direction", "transport"]);
20+
1721
pub static ref TCP_PEERS_CONNECTED: Result<IntGauge> = try_create_int_gauge(
1822
"libp2p_tcp_peers",
1923
"Count of libp2p peers currently connected via TCP"
@@ -32,13 +36,10 @@ lazy_static! {
3236
"libp2p_peer_disconnect_event_total",
3337
"Count of libp2p peer disconnect events"
3438
);
35-
pub static ref DISCOVERY_SENT_BYTES: Result<IntGauge> = try_create_int_gauge(
36-
"discovery_sent_bytes",
37-
"The number of bytes sent in discovery"
38-
);
39-
pub static ref DISCOVERY_RECV_BYTES: Result<IntGauge> = try_create_int_gauge(
40-
"discovery_recv_bytes",
41-
"The number of bytes received in discovery"
39+
pub static ref DISCOVERY_BYTES: Result<IntGaugeVec> = try_create_int_gauge_vec(
40+
"discovery_bytes",
41+
"The number of bytes sent and received in discovery",
42+
&["direction"]
4243
);
4344
pub static ref DISCOVERY_QUEUE: Result<IntGauge> = try_create_int_gauge(
4445
"discovery_queue_size",
@@ -135,17 +136,6 @@ lazy_static! {
135136
&["type"]
136137
);
137138

138-
/*
139-
* Inbound/Outbound peers
140-
*/
141-
/// The number of peers that dialed us.
142-
pub static ref NETWORK_INBOUND_PEERS: Result<IntGauge> =
143-
try_create_int_gauge("network_inbound_peers","The number of peers that are currently connected that have dialed us.");
144-
145-
/// The number of peers that we dialed us.
146-
pub static ref NETWORK_OUTBOUND_PEERS: Result<IntGauge> =
147-
try_create_int_gauge("network_outbound_peers","The number of peers that are currently connected that we dialed.");
148-
149139
/*
150140
* Peer Reporting
151141
*/
@@ -156,31 +146,11 @@ lazy_static! {
156146
);
157147
}
158148

159-
/// Checks if we consider the NAT open.
160-
///
161-
/// Conditions for an open NAT:
162-
/// 1. We have 1 or more SOCKET_UPDATED messages. This occurs when discovery has a majority of
163-
/// users reporting an external port and our ENR gets updated.
164-
/// 2. We have 0 SOCKET_UPDATED messages (can be true if the port was correct on boot), then we
165-
/// rely on whether we have any inbound messages. If we have no socket update messages, but
166-
/// manage to get at least one inbound peer, we are exposed correctly.
167-
pub fn check_nat() {
168-
// NAT is already deemed open.
169-
if NAT_OPEN.as_ref().map(|v| v.get()).unwrap_or(0) != 0 {
170-
return;
171-
}
172-
if ADDRESS_UPDATE_COUNT.as_ref().map(|v| v.get()).unwrap_or(0) != 0
173-
|| NETWORK_INBOUND_PEERS.as_ref().map(|v| v.get()).unwrap_or(0) != 0_i64
174-
{
175-
inc_counter(&NAT_OPEN);
176-
}
177-
}
178-
179149
pub fn scrape_discovery_metrics() {
180150
let metrics =
181151
discv5::metrics::Metrics::from(discv5::Discv5::<discv5::DefaultProtocolId>::raw_metrics());
182152
set_float_gauge(&DISCOVERY_REQS, metrics.unsolicited_requests_per_second);
183153
set_gauge(&DISCOVERY_SESSIONS, metrics.active_sessions as i64);
184-
set_gauge(&DISCOVERY_SENT_BYTES, metrics.bytes_sent as i64);
185-
set_gauge(&DISCOVERY_RECV_BYTES, metrics.bytes_recv as i64);
154+
set_gauge_vec(&DISCOVERY_BYTES, &["inbound"], metrics.bytes_recv as i64);
155+
set_gauge_vec(&DISCOVERY_BYTES, &["outbound"], metrics.bytes_sent as i64);
186156
}

beacon_node/lighthouse_network/src/peer_manager/mod.rs

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,14 @@ use delay_map::HashSetDelay;
1010
use discv5::Enr;
1111
use libp2p::identify::Info as IdentifyInfo;
1212
use lru_cache::LRUTimeCache;
13-
use peerdb::{client::ClientKind, BanOperation, BanResult, ScoreUpdateResult};
13+
use peerdb::{BanOperation, BanResult, ScoreUpdateResult};
1414
use rand::seq::SliceRandom;
1515
use slog::{debug, error, trace, warn};
1616
use smallvec::SmallVec;
1717
use std::{
1818
sync::Arc,
1919
time::{Duration, Instant},
2020
};
21-
use strum::IntoEnumIterator;
2221
use types::{EthSpec, SyncSubnetId};
2322

2423
pub use libp2p::core::Multiaddr;
@@ -719,46 +718,6 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
719718
}
720719
}
721720

722-
// This function updates metrics for all connected peers.
723-
fn update_connected_peer_metrics(&self) {
724-
// Do nothing if we don't have metrics enabled.
725-
if !self.metrics_enabled {
726-
return;
727-
}
728-
729-
let mut connected_peer_count = 0;
730-
let mut inbound_connected_peers = 0;
731-
let mut outbound_connected_peers = 0;
732-
let mut clients_per_peer = HashMap::new();
733-
734-
for (_peer, peer_info) in self.network_globals.peers.read().connected_peers() {
735-
connected_peer_count += 1;
736-
if let PeerConnectionStatus::Connected { n_in, .. } = peer_info.connection_status() {
737-
if *n_in > 0 {
738-
inbound_connected_peers += 1;
739-
} else {
740-
outbound_connected_peers += 1;
741-
}
742-
}
743-
*clients_per_peer
744-
.entry(peer_info.client().kind.to_string())
745-
.or_default() += 1;
746-
}
747-
748-
metrics::set_gauge(&metrics::PEERS_CONNECTED, connected_peer_count);
749-
metrics::set_gauge(&metrics::NETWORK_INBOUND_PEERS, inbound_connected_peers);
750-
metrics::set_gauge(&metrics::NETWORK_OUTBOUND_PEERS, outbound_connected_peers);
751-
752-
for client_kind in ClientKind::iter() {
753-
let value = clients_per_peer.get(&client_kind.to_string()).unwrap_or(&0);
754-
metrics::set_gauge_vec(
755-
&metrics::PEERS_PER_CLIENT,
756-
&[client_kind.as_ref()],
757-
*value as i64,
758-
);
759-
}
760-
}
761-
762721
/* Internal functions */
763722

764723
/// Sets a peer as connected as long as their reputation allows it

beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,8 @@ impl<TSpec: EthSpec> NetworkBehaviour for PeerManager<TSpec> {
154154
self.on_dial_failure(peer_id);
155155
}
156156
FromSwarm::ExternalAddrConfirmed(_) => {
157-
// TODO: we likely want to check this against our assumed external tcp
158-
// address
157+
// We have an external address confirmed, means we are able to do NAT traversal.
158+
metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p"], 1);
159159
}
160160
_ => {
161161
// NOTE: FromSwarm is a non exhaustive enum so updates should be based on release
@@ -243,33 +243,34 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
243243
self.events.push(PeerManagerEvent::MetaData(peer_id));
244244
}
245245

246-
// Check NAT if metrics are enabled
247-
if self.network_globals.local_enr.read().udp4().is_some() {
248-
metrics::check_nat();
249-
}
250-
251246
// increment prometheus metrics
252247
if self.metrics_enabled {
253248
let remote_addr = endpoint.get_remote_address();
249+
let direction = if endpoint.is_dialer() {
250+
"outbound"
251+
} else {
252+
"inbound"
253+
};
254+
254255
match remote_addr.iter().find(|proto| {
255256
matches!(
256257
proto,
257258
multiaddr::Protocol::QuicV1 | multiaddr::Protocol::Tcp(_)
258259
)
259260
}) {
260261
Some(multiaddr::Protocol::QuicV1) => {
261-
metrics::inc_gauge(&metrics::QUIC_PEERS_CONNECTED);
262+
metrics::inc_gauge_vec(&metrics::PEERS_CONNECTED_MULTI, &[direction, "quic"]);
262263
}
263264
Some(multiaddr::Protocol::Tcp(_)) => {
264-
metrics::inc_gauge(&metrics::TCP_PEERS_CONNECTED);
265+
metrics::inc_gauge_vec(&metrics::PEERS_CONNECTED_MULTI, &[direction, "tcp"]);
265266
}
266267
Some(_) => unreachable!(),
267268
None => {
268269
error!(self.log, "Connection established via unknown transport"; "addr" => %remote_addr)
269270
}
270271
};
271272

272-
self.update_connected_peer_metrics();
273+
metrics::inc_gauge(&metrics::PEERS_CONNECTED);
273274
metrics::inc_counter(&metrics::PEER_CONNECT_EVENT_COUNT);
274275
}
275276

@@ -339,22 +340,29 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
339340
let remote_addr = endpoint.get_remote_address();
340341
// Update the prometheus metrics
341342
if self.metrics_enabled {
343+
let direction = if endpoint.is_dialer() {
344+
"outbound"
345+
} else {
346+
"inbound"
347+
};
348+
342349
match remote_addr.iter().find(|proto| {
343350
matches!(
344351
proto,
345352
multiaddr::Protocol::QuicV1 | multiaddr::Protocol::Tcp(_)
346353
)
347354
}) {
348355
Some(multiaddr::Protocol::QuicV1) => {
349-
metrics::dec_gauge(&metrics::QUIC_PEERS_CONNECTED);
356+
metrics::dec_gauge_vec(&metrics::PEERS_CONNECTED_MULTI, &[direction, "quic"]);
350357
}
351358
Some(multiaddr::Protocol::Tcp(_)) => {
352-
metrics::dec_gauge(&metrics::TCP_PEERS_CONNECTED);
359+
metrics::dec_gauge_vec(&metrics::PEERS_CONNECTED_MULTI, &[direction, "tcp"]);
353360
}
354361
// If it's an unknown protocol we already logged when connection was established.
355362
_ => {}
356363
};
357-
self.update_connected_peer_metrics();
364+
// Legacy standard metrics.
365+
metrics::dec_gauge(&metrics::PEERS_CONNECTED);
358366
metrics::inc_counter(&metrics::PEER_DISCONNECT_EVENT_COUNT);
359367
}
360368
}

common/system_health/src/lib.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,25 @@ pub fn observe_system_health_vc(
198198
}
199199
}
200200

201+
/// Observes if NAT traversal is possible.
202+
pub fn observe_nat() -> bool {
203+
let discv5_nat = lighthouse_network::metrics::get_int_gauge(
204+
&lighthouse_network::metrics::NAT_OPEN,
205+
&["discv5"],
206+
)
207+
.map(|g| g.get() == 1)
208+
.unwrap_or_default();
209+
210+
let libp2p_nat = lighthouse_network::metrics::get_int_gauge(
211+
&lighthouse_network::metrics::NAT_OPEN,
212+
&["libp2p"],
213+
)
214+
.map(|g| g.get() == 1)
215+
.unwrap_or_default();
216+
217+
discv5_nat && libp2p_nat
218+
}
219+
201220
/// Observes the Beacon Node system health.
202221
pub fn observe_system_health_bn<TSpec: EthSpec>(
203222
sysinfo: Arc<RwLock<System>>,
@@ -223,11 +242,7 @@ pub fn observe_system_health_bn<TSpec: EthSpec>(
223242
.unwrap_or_else(|| (String::from("None"), 0, 0));
224243

225244
// Determine if the NAT is open or not.
226-
let nat_open = lighthouse_network::metrics::NAT_OPEN
227-
.as_ref()
228-
.map(|v| v.get())
229-
.unwrap_or(0)
230-
!= 0;
245+
let nat_open = observe_nat();
231246

232247
SystemHealthBN {
233248
system_health,

0 commit comments

Comments
 (0)