
Commit 2367153

Address @jimmygchen review
1 parent 9de3178 commit 2367153

6 files changed: +107 -36 lines

beacon_node/beacon_processor/src/lib.rs

Lines changed: 13 additions & 0 deletions
@@ -1000,6 +1000,11 @@ impl<E: EthSpec> BeaconProcessor<E> {
                     self.spawn_worker(item, idle_tx);
                 } else if let Some(item) = rpc_blob_queue.pop() {
                     self.spawn_worker(item, idle_tx);
+                // TODO(das): decide proper prioritization for sampling columns
+                } else if let Some(item) = rpc_verify_data_column_queue.pop() {
+                    self.spawn_worker(item, idle_tx);
+                } else if let Some(item) = sampling_result_queue.pop() {
+                    self.spawn_worker(item, idle_tx);
                 // Check delayed blocks before gossip blocks, the gossip blocks might rely
                 // on the delayed ones.
                 } else if let Some(item) = delayed_block_queue.pop() {
@@ -1389,6 +1394,14 @@ impl<E: EthSpec> BeaconProcessor<E> {
                &metrics::BEACON_PROCESSOR_RPC_BLOB_QUEUE_TOTAL,
                rpc_blob_queue.len() as i64,
            );
+           metrics::set_gauge(
+               &metrics::BEACON_PROCESSOR_RPC_VERIFY_DATA_COLUMN_QUEUE_TOTAL,
+               rpc_verify_data_column_queue.len() as i64,
+           );
+           metrics::set_gauge(
+               &metrics::BEACON_PROCESSOR_SAMPLING_RESULT_QUEUE_TOTAL,
+               sampling_result_queue.len() as i64,
+           );
            metrics::set_gauge(
                &metrics::BEACON_PROCESSOR_CHAIN_SEGMENT_QUEUE_TOTAL,
                chain_segment_queue.len() as i64,
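
A quick illustrative sketch (not from the Lighthouse codebase): the two new queues are served purely by their position in the else-if chain, after RPC blobs and before delayed blocks. Below is a hypothetical, stripped-down version of that pop-in-priority-order pattern using plain VecDeques in place of the real work queues.

use std::collections::VecDeque;

// Each queue is polled in turn and the first non-empty one wins, so the new data-column
// verification and sampling-result queues rank below RPC blobs only because of their
// position in this chain.
fn next_item(
    rpc_blob_queue: &mut VecDeque<&'static str>,
    rpc_verify_data_column_queue: &mut VecDeque<&'static str>,
    sampling_result_queue: &mut VecDeque<&'static str>,
) -> Option<&'static str> {
    if let Some(item) = rpc_blob_queue.pop_front() {
        Some(item)
    } else if let Some(item) = rpc_verify_data_column_queue.pop_front() {
        Some(item)
    } else if let Some(item) = sampling_result_queue.pop_front() {
        Some(item)
    } else {
        None
    }
}

fn main() {
    let mut blobs = VecDeque::new();
    let mut columns = VecDeque::from(["verify column 3"]);
    let mut samples = VecDeque::from(["sampling result for block A"]);
    // With the blob queue empty, the data-column queue is served before sampling results.
    assert_eq!(
        next_item(&mut blobs, &mut columns, &mut samples),
        Some("verify column 3")
    );
}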

beacon_node/beacon_processor/src/metrics.rs

Lines changed: 10 additions & 0 deletions
@@ -86,6 +86,16 @@ lazy_static::lazy_static! {
         "beacon_processor_rpc_blob_queue_total",
         "Count of blobs from the rpc waiting to be verified."
     );
+    // Rpc verify data columns
+    pub static ref BEACON_PROCESSOR_RPC_VERIFY_DATA_COLUMN_QUEUE_TOTAL: Result<IntGauge> = try_create_int_gauge(
+        "beacon_processor_rpc_verify_data_column_queue_total",
+        "Count of data columns from the rpc waiting to be verified."
+    );
+    // Sampling result
+    pub static ref BEACON_PROCESSOR_SAMPLING_RESULT_QUEUE_TOTAL: Result<IntGauge> = try_create_int_gauge(
+        "beacon_processor_sampling_result_queue_total",
+        "Count of sampling results waiting to be processed."
+    );
     // Chain segments.
     pub static ref BEACON_PROCESSOR_CHAIN_SEGMENT_QUEUE_TOTAL: Result<IntGauge> = try_create_int_gauge(
         "beacon_processor_chain_segment_queue_total",

beacon_node/lighthouse_network/src/rpc/methods.rs

Lines changed: 5 additions & 3 deletions
@@ -379,9 +379,11 @@ pub struct DataColumnsByRootRequest {
 }
 
 impl DataColumnsByRootRequest {
-    pub fn new(blob_ids: Vec<DataColumnIdentifier>, spec: &ChainSpec) -> Self {
-        let data_column_ids =
-            RuntimeVariableList::from_vec(blob_ids, spec.max_request_data_column_sidecars as usize);
+    pub fn new(data_column_ids: Vec<DataColumnIdentifier>, spec: &ChainSpec) -> Self {
+        let data_column_ids = RuntimeVariableList::from_vec(
+            data_column_ids,
+            spec.max_request_data_column_sidecars as usize,
+        );
         Self { data_column_ids }
     }
 }
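
A quick illustrative sketch (not from the Lighthouse codebase): RuntimeVariableList::from_vec bounds the request by max_request_data_column_sidecars at runtime. The stub below is a simplified stand-in that mirrors the truncation behaviour as I understand it; the real type lives in Lighthouse's types crates and may enforce the limit differently.

// Simplified stand-in: anything beyond the runtime limit is dropped rather than rejected.
#[derive(Debug)]
struct RuntimeVariableList<T> {
    items: Vec<T>,
    max_len: usize,
}

impl<T> RuntimeVariableList<T> {
    fn from_vec(mut items: Vec<T>, max_len: usize) -> Self {
        items.truncate(max_len);
        Self { items, max_len }
    }
}

fn main() {
    // Pretend the spec allows at most 2 data column identifiers per request.
    let ids = vec!["id-0", "id-1", "id-2"];
    let list = RuntimeVariableList::from_vec(ids, 2);
    assert_eq!(list.items.len(), 2);
    assert_eq!(list.max_len, 2);
}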

beacon_node/network/src/sync/manager.rs

Lines changed: 14 additions & 16 deletions
@@ -125,6 +125,7 @@ pub enum SyncMessage<E: EthSpec> {
         seen_timestamp: Duration,
     },
 
+    /// A data column has been received from the RPC
     RpcDataColumn {
         request_id: RequestId,
         peer_id: PeerId,
@@ -1044,25 +1045,22 @@ impl<T: BeaconChainTypes> SyncManager<T> {
         peer_id: PeerId,
         data_column: RpcEvent<Arc<DataColumnSidecar<T::EthSpec>>>,
     ) {
-        let Some((requester, resp)) = self
+        if let Some((requester, resp)) = self
             .network
             .on_data_columns_by_root_response(id, data_column)
-        else {
-            // TOOD(das): error o log
-            return;
-        };
-
-        match requester {
-            DataColumnsByRootRequester::Sampling(id) => {
-                if let Some(result) =
-                    self.sampling
-                        .on_sample_downloaded(id, peer_id, resp, &mut self.network)
-                {
-                    self.on_sampling_result(id.id, result)
+        {
+            match requester {
+                DataColumnsByRootRequester::Sampling(id) => {
+                    if let Some(result) =
+                        self.sampling
+                            .on_sample_downloaded(id, peer_id, resp, &mut self.network)
+                    {
+                        self.on_sampling_result(id.id, result)
+                    }
+                }
+                DataColumnsByRootRequester::Custody => {
+                    todo!("TODO(das): handle custody requests");
                 }
-            }
-            DataColumnsByRootRequester::Custody => {
-                todo!("TODO(das): handle custody requests");
             }
         }
     }
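
A quick illustrative sketch (not from the Lighthouse codebase): the change above swaps a let-else early return for an if-let block, dropping the old TODO about logging the miss. Below is a generic, hypothetical comparison of the two shapes using stand-in types.

// Both forms silently ignore a missing request; only the control-flow shape differs.
fn with_let_else(response: Option<(u32, &str)>) {
    let Some((requester, resp)) = response else {
        // Early return: no matching request (the removed code carried a TODO to log here).
        return;
    };
    println!("requester {requester} got {resp}");
}

fn with_if_let(response: Option<(u32, &str)>) {
    if let Some((requester, resp)) = response {
        println!("requester {requester} got {resp}");
    }
}

fn main() {
    with_let_else(Some((1, "column")));
    with_if_let(Some((1, "column")));
    with_let_else(None);
    with_if_let(None);
}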

beacon_node/network/src/sync/network_context.rs

Lines changed: 12 additions & 8 deletions
@@ -20,7 +20,9 @@ use beacon_chain::{BeaconChain, BeaconChainTypes, EngineState};
 use fnv::FnvHashMap;
 use lighthouse_network::rpc::methods::BlobsByRangeRequest;
 use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError};
-use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource, Request};
+use lighthouse_network::{
+    Client, Eth2Enr, NetworkGlobals, PeerAction, PeerId, ReportSource, Request,
+};
 pub use requests::LookupVerifyError;
 use slog::{debug, trace, warn};
 use std::collections::hash_map::Entry;
@@ -59,7 +61,7 @@ pub enum RpcEvent<T> {
     RPCError(RPCError),
 }
 
-pub type RpcProcessingResult<T, ID> = Option<(ID, Result<(T, Duration), LookupFailure>)>;
+pub type RpcProcessingResult<ID, T> = Option<(ID, Result<(T, Duration), LookupFailure>)>;
 
 pub enum LookupFailure {
     RpcError(RPCError),
@@ -163,10 +165,12 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
     pub fn get_custodial_peers(&self, _epoch: Epoch, column_index: ColumnIndex) -> Vec<PeerId> {
         let mut peer_ids = vec![];
 
-        for (peer_id, peer_info) in self.network_globals().peers.read().peers() {
+        for (peer_id, peer_info) in self.network_globals().peers.read().connected_peers() {
             if let Some(enr) = peer_info.enr() {
-                // TODO(das): do not hardcode `custody_subnet_count`
-                let custody_subnet_count = 2;
+                // TODO(das): ignores decode errors
+                let custody_subnet_count = enr
+                    .custody_subnet_count::<T::EthSpec>()
+                    .unwrap_or(T::EthSpec::min_custody_requirement() as u64);
                 // TODO(das): consider caching a map of subnet -> Vec<PeerId> and invalidating
                 // whenever a peer connected or disconnect event in received
                 let mut subnets = DataColumnSubnetId::compute_custody_subnets::<T::EthSpec>(
@@ -551,7 +555,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
         &mut self,
         request_id: SingleLookupReqId,
         block: RpcEvent<Arc<SignedBeaconBlock<T::EthSpec>>>,
-    ) -> RpcProcessingResult<Arc<SignedBeaconBlock<T::EthSpec>>, ()> {
+    ) -> RpcProcessingResult<(), Arc<SignedBeaconBlock<T::EthSpec>>> {
         let Entry::Occupied(mut request) = self.blocks_by_root_requests.entry(request_id) else {
             return None;
         };
@@ -583,7 +587,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
         &mut self,
         request_id: SingleLookupReqId,
         blob: RpcEvent<Arc<BlobSidecar<T::EthSpec>>>,
-    ) -> RpcProcessingResult<FixedBlobSidecarList<T::EthSpec>, ()> {
+    ) -> RpcProcessingResult<(), FixedBlobSidecarList<T::EthSpec>> {
         let Entry::Occupied(mut request) = self.blobs_by_root_requests.entry(request_id) else {
             return None;
         };
@@ -620,7 +624,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
         &mut self,
         id: Id,
         item: RpcEvent<Arc<DataColumnSidecar<T::EthSpec>>>,
-    ) -> RpcProcessingResult<Vec<Arc<DataColumnSidecar<T::EthSpec>>>, DataColumnsByRootRequester>
+    ) -> RpcProcessingResult<DataColumnsByRootRequester, Vec<Arc<DataColumnSidecar<T::EthSpec>>>>
     {
         let Entry::Occupied(mut request) = self.data_columns_by_root_requests.entry(id) else {
             return None;
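
A quick illustrative sketch (not from the Lighthouse codebase): the alias reorder puts the requester ID first and the payload second, matching how call sites destructure the tuple ("who asked, then what came back"). Below is a self-contained version with LookupFailure stubbed; the real enum lives in network_context.rs.

use std::time::Duration;

#[derive(Debug)]
enum LookupFailure {
    RpcError(String),
}

// Same shape as the reordered alias in the hunk above.
type RpcProcessingResult<ID, T> = Option<(ID, Result<(T, Duration), LookupFailure>)>;

fn route_response<ID: std::fmt::Debug, T>(result: RpcProcessingResult<ID, T>) {
    match result {
        None => {} // no active request matched this response; nothing to do
        Some((id, Ok((_payload, seen_timestamp)))) => {
            println!("requester {id:?} got a response after {seen_timestamp:?}");
        }
        Some((id, Err(failure))) => {
            println!("request for {id:?} failed: {failure:?}");
        }
    }
}

fn main() {
    let ok: RpcProcessingResult<u32, Vec<u8>> =
        Some((42, Ok((vec![0u8; 4], Duration::from_millis(120)))));
    route_response(ok);

    let failed: RpcProcessingResult<u32, Vec<u8>> =
        Some((7, Err(LookupFailure::RpcError("timeout".into()))));
    route_response(failed);
}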

beacon_node/network/src/sync/sampling.rs

Lines changed: 53 additions & 9 deletions
@@ -10,7 +10,7 @@ use std::{
     collections::hash_map::Entry, collections::HashMap, marker::PhantomData, sync::Arc,
     time::Duration,
 };
-use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, Hash256, Slot};
+use types::{data_column_sidecar::ColumnIndex, DataColumnSidecar, EthSpec, Hash256, Slot};
 
 pub type SamplingResult = Result<(), SamplingError>;
 
@@ -48,6 +48,12 @@ impl<T: BeaconChainTypes> Sampling<T> {
         self.requests.values().map(|r| r.block_root).collect()
     }
 
+    /// Create a new sampling request for a known block
+    ///
+    /// ### Returns
+    ///
+    /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
+    /// - `None`: Request still active, requester should take no action
     pub fn on_new_sample_request(
         &mut self,
         block_root: Hash256,
@@ -81,6 +87,13 @@ impl<T: BeaconChainTypes> Sampling<T> {
             .map(|result| (requester, result))
     }
 
+    /// Insert a downloaded column into an active sampling request. Then make progress on the
+    /// entire request.
+    ///
+    /// ### Returns
+    ///
+    /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
+    /// - `None`: Request still active, requester should take no action
    pub fn on_sample_downloaded(
         &mut self,
         id: SamplingId,
@@ -98,6 +111,13 @@ impl<T: BeaconChainTypes> Sampling<T> {
         self.handle_sampling_result(result, &id.id)
     }
 
+    /// Insert a column verification result into an active sampling request. Then make progress
+    /// on the entire request.
+    ///
+    /// ### Returns
+    ///
+    /// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
+    /// - `None`: Request still active, requester should take no action
     pub fn on_sample_verified(
         &mut self,
         id: SamplingId,
@@ -114,14 +134,17 @@ impl<T: BeaconChainTypes> Sampling<T> {
         self.handle_sampling_result(result, &id.id)
     }
 
+    /// Converts a result from the internal format of `ActiveSamplingRequest` (error-first, so `?`
+    /// can be used conveniently) into an Option-first format suited to an
+    /// `if let Some() { act on result }` pattern in the sync manager.
     fn handle_sampling_result(
         &mut self,
         result: Result<Option<()>, SamplingError>,
         id: &SamplingRequester,
     ) -> Option<SamplingResult> {
         let result = result.transpose();
         if result.is_some() {
-            debug!(self.log, "Removed sampling request"; "id" => ?id);
+            debug!(self.log, "Remove completed sampling request"; "id" => ?id, "result" => ?result);
             self.requests.remove(id);
         }
         result
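
A quick illustrative sketch (not from the Lighthouse codebase): the new doc comment describes a Result-to-Option flip done with transpose(). Below is a minimal, self-contained version of that conversion with SamplingError stubbed.

#[derive(Debug, PartialEq)]
enum SamplingError {
    TooManyFailures,
}

// Error-first internally (so `?` composes), Option-first for the caller (so the sync
// manager can use a plain `if let Some(result)`).
fn handle_sampling_result(
    result: Result<Option<()>, SamplingError>,
) -> Option<Result<(), SamplingError>> {
    let result = result.transpose();
    if result.is_some() {
        // In the real code the completed request is also removed from `self.requests` here.
    }
    result
}

fn main() {
    assert_eq!(handle_sampling_result(Ok(None)), None); // still in progress
    assert_eq!(handle_sampling_result(Ok(Some(()))), Some(Ok(()))); // completed successfully
    assert_eq!(
        handle_sampling_result(Err(SamplingError::TooManyFailures)),
        Some(Err(SamplingError::TooManyFailures)) // failed, will be dropped
    );
}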
@@ -146,6 +169,7 @@ pub enum SamplingError {
     ProcessorUnavailable,
     TooManyFailures,
     BadState(String),
+    ColumnIndexOutOfBounds,
 }
 
 /// Required success index by current failures, with p_target=5.00E-06
@@ -170,7 +194,8 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
         log: slog::Logger,
     ) -> Self {
         // Select ahead of time the full list of to-sample columns
-        let mut column_shuffle = (0..64).collect::<Vec<ColumnIndex>>();
+        let mut column_shuffle = (0..<T::EthSpec as EthSpec>::number_of_columns() as ColumnIndex)
+            .collect::<Vec<ColumnIndex>>();
         let mut rng = thread_rng();
         column_shuffle.shuffle(&mut rng);
 
@@ -189,9 +214,15 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
         }
     }
 
-    // TODO: When is a fork and only a subset of your peers know about a block, sampling should only
-    // be queried on the peers on that fork. Should this case be handled? How to handle it?
-    fn on_sample_downloaded(
+    /// Insert a downloaded column into an active sampling request. Then make progress on the
+    /// entire request.
+    ///
+    /// ### Returns
+    ///
+    /// - `Err`: Sampling request has failed and will be dropped
+    /// - `Ok(Some)`: Sampling request has successfully completed and will be dropped
+    /// - `Ok(None)`: Sampling request still active
+    pub(crate) fn on_sample_downloaded(
         &mut self,
         _peer_id: PeerId,
         column_index: ColumnIndex,
@@ -258,6 +289,14 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
         self.continue_sampling(cx)
     }
 
+    /// Insert a column verification result into an active sampling request. Then make progress
+    /// on the entire request.
+    ///
+    /// ### Returns
+    ///
+    /// - `Err`: Sampling request has failed and will be dropped
+    /// - `Ok(Some)`: Sampling request has successfully completed and will be dropped
+    /// - `Ok(None)`: Sampling request still active
     pub(crate) fn on_sample_verified(
         &mut self,
         column_index: ColumnIndex,
@@ -301,7 +340,7 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
         self.continue_sampling(cx)
     }
 
-    fn continue_sampling(
+    pub(crate) fn continue_sampling(
         &mut self,
         cx: &mut SyncNetworkContext<T>,
     ) -> Result<Option<()>, SamplingError> {
@@ -337,8 +376,11 @@ impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
         // First, attempt to progress sampling by requesting more columns, so that request failures
         // are accounted for below.
         for idx in 0..*required_successes {
-            // Re-request columns
-            let column_index = self.column_shuffle[idx];
+            // Re-request columns. Note: an out-of-bounds error should never happen, inputs are hardcoded
+            let column_index = *self
+                .column_shuffle
+                .get(idx)
+                .ok_or(SamplingError::ColumnIndexOutOfBounds)?;
             let request = self
                 .column_requests
                 .entry(column_index)
@@ -431,6 +473,8 @@ mod request {
            Status::Verified => return Ok(false), // Already completed
        }
 
+       // TODO: When there is a fork and only a subset of peers know about a block, sampling should
+       // only be queried on the peers on that fork. Should this case be handled? How?
        let peer_ids = cx.get_custodial_peers(
            block_slot.epoch(<T::EthSpec as EthSpec>::slots_per_epoch()),
            self.column_index,
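
A quick illustrative sketch (not from the Lighthouse codebase): the continue_sampling hunk replaces direct indexing with get(idx).ok_or(...)?, backed by the new ColumnIndexOutOfBounds variant. A small stand-alone version of that guard:

#[derive(Debug, PartialEq)]
enum SamplingError {
    ColumnIndexOutOfBounds,
}

// Out of bounds "should never happen" (the inputs are derived from spec constants), but
// an explicit error beats an index panic on the sync thread.
fn pick_column(column_shuffle: &[u64], idx: usize) -> Result<u64, SamplingError> {
    let column_index = *column_shuffle
        .get(idx)
        .ok_or(SamplingError::ColumnIndexOutOfBounds)?;
    Ok(column_index)
}

fn main() {
    let shuffle = vec![5, 1, 3];
    assert_eq!(pick_column(&shuffle, 1), Ok(1));
    assert_eq!(pick_column(&shuffle, 9), Err(SamplingError::ColumnIndexOutOfBounds));
}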
