add performance metrics to gossipsub (#2346)

* add performance metrics to gossipsub

* add changelog entry

* Re-export open-client-metrics crate

* Added some extra metrics

* More metrics

* Additional metrics

* Correct topic encoding

* More manual encoding of metric labels

* Clean up metric labelling

* Improve heartbeat duration buckets

* Remove re-export

Co-authored-by: Age Manning <Age@AgeManning.com>
This commit is contained in:
Divma
2021-12-21 17:31:19 -05:00
committed by GitHub
parent 379001a1d0
commit 17861d9cac
7 changed files with 539 additions and 177 deletions

View File

@ -4,9 +4,12 @@
- Migrate to Rust edition 2021 (see [PR 2339]). - Migrate to Rust edition 2021 (see [PR 2339]).
- Add metrics for network and configuration performance analysis (see [PR 2346]).
- Improve bandwidth performance by tracking IWANTs and reducing duplicate sends - Improve bandwidth performance by tracking IWANTs and reducing duplicate sends
(see [PR 2327]). (see [PR 2327]).
[PR 2346]: https://github.com/libp2p/rust-libp2p/pull/2346
[PR 2339]: https://github.com/libp2p/rust-libp2p/pull/2339 [PR 2339]: https://github.com/libp2p/rust-libp2p/pull/2339
[PR 2327]: https://github.com/libp2p/rust-libp2p/pull/2327 [PR 2327]: https://github.com/libp2p/rust-libp2p/pull/2327

View File

@ -31,7 +31,7 @@ futures-timer = "3.0.2"
pin-project = "1.0.8" pin-project = "1.0.8"
instant = "0.1.11" instant = "0.1.11"
# Metrics dependencies # Metrics dependencies
open-metrics-client = "0.13" open-metrics-client = "0.13.0"
[dev-dependencies] [dev-dependencies]
async-std = "1.6.3" async-std = "1.6.3"

View File

@ -51,7 +51,7 @@ use crate::error::{PublishError, SubscriptionError, ValidationError};
use crate::gossip_promises::GossipPromises; use crate::gossip_promises::GossipPromises;
use crate::handler::{GossipsubHandler, GossipsubHandlerIn, HandlerEvent}; use crate::handler::{GossipsubHandler, GossipsubHandlerIn, HandlerEvent};
use crate::mcache::MessageCache; use crate::mcache::MessageCache;
use crate::metrics::{Churn, Config as MetricsConfig, Inclusion, Metrics}; use crate::metrics::{Churn, Config as MetricsConfig, Inclusion, Metrics, Penalty};
use crate::peer_score::{PeerScore, PeerScoreParams, PeerScoreThresholds, RejectReason}; use crate::peer_score::{PeerScore, PeerScoreParams, PeerScoreThresholds, RejectReason};
use crate::protocol::SIGNING_PREFIX; use crate::protocol::SIGNING_PREFIX;
use crate::subscription_filter::{AllowAllSubscriptionFilter, TopicSubscriptionFilter}; use crate::subscription_filter::{AllowAllSubscriptionFilter, TopicSubscriptionFilter};
@ -635,10 +635,9 @@ where
return Err(PublishError::Duplicate); return Err(PublishError::Duplicate);
} }
debug!("Publishing message: {:?}", msg_id); trace!("Publishing message: {:?}", msg_id);
let topic_hash = raw_message.topic.clone(); let topic_hash = raw_message.topic.clone();
let msg_bytes = raw_message.data.len();
// If we are not flood publishing forward the message to mesh peers. // If we are not flood publishing forward the message to mesh peers.
let mesh_peers_sent = !self.config.flood_publish() let mesh_peers_sent = !self.config.flood_publish()
@ -732,8 +731,9 @@ where
} }
// Send to peers we know are subscribed to the topic. // Send to peers we know are subscribed to the topic.
let msg_bytes = event.encoded_len();
for peer_id in recipient_peers.iter() { for peer_id in recipient_peers.iter() {
debug!("Sending message to peer: {:?}", peer_id); trace!("Sending message to peer: {:?}", peer_id);
self.send_message(*peer_id, event.clone())?; self.send_message(*peer_id, event.clone())?;
if let Some(m) = self.metrics.as_mut() { if let Some(m) = self.metrics.as_mut() {
@ -742,6 +742,11 @@ where
} }
debug!("Published message: {:?}", &msg_id); debug!("Published message: {:?}", &msg_id);
if let Some(metrics) = self.metrics.as_mut() {
metrics.register_published_message(&topic_hash);
}
Ok(msg_id) Ok(msg_id)
} }
@ -784,6 +789,11 @@ where
return Ok(false); return Ok(false);
} }
}; };
if let Some(metrics) = self.metrics.as_mut() {
metrics.register_msg_validation(&raw_message.topic, &acceptance);
}
self.forward_msg( self.forward_msg(
msg_id, msg_id,
raw_message, raw_message,
@ -797,6 +807,10 @@ where
}; };
if let Some((raw_message, originating_peers)) = self.mcache.remove(msg_id) { if let Some((raw_message, originating_peers)) = self.mcache.remove(msg_id) {
if let Some(metrics) = self.metrics.as_mut() {
metrics.register_msg_validation(&raw_message.topic, &acceptance);
}
// Tell peer_score about reject // Tell peer_score about reject
// Reject the original source, and any duplicates we've seen from other peers. // Reject the original source, and any duplicates we've seen from other peers.
if let Some((peer_score, ..)) = &mut self.peer_score { if let Some((peer_score, ..)) = &mut self.peer_score {
@ -960,7 +974,7 @@ where
let fanaout_added = added_peers.len(); let fanaout_added = added_peers.len();
if let Some(m) = self.metrics.as_mut() { if let Some(m) = self.metrics.as_mut() {
m.peers_included(topic_hash, Inclusion::Fanaout, fanaout_added) m.peers_included(topic_hash, Inclusion::Fanout, fanaout_added)
} }
// check if we need to get more peers, which we randomly select // check if we need to get more peers, which we randomly select
@ -1191,7 +1205,7 @@ where
trace!("Handling IHAVE for peer: {:?}", peer_id); trace!("Handling IHAVE for peer: {:?}", peer_id);
// use a hashset to avoid duplicates efficiently // use a hashmap to avoid duplicates efficiently
let mut iwant_ids = HashMap::new(); let mut iwant_ids = HashMap::new();
for (topic, ids) in ihave_msgs { for (topic, ids) in ihave_msgs {
@ -1308,38 +1322,29 @@ where
// Send the messages to the peer // Send the messages to the peer
let message_list: Vec<_> = cached_messages.into_iter().map(|entry| entry.1).collect(); let message_list: Vec<_> = cached_messages.into_iter().map(|entry| entry.1).collect();
let mut topic_msgs = HashMap::<TopicHash, Vec<usize>>::default(); let topics = message_list
if self.metrics.is_some() { .iter()
for msg in message_list.iter() { .map(|message| message.topic.clone())
topic_msgs .collect::<HashSet<TopicHash>>();
.entry(msg.topic.clone())
.or_default()
.push(msg.data.len());
}
}
if self let message = GossipsubRpc {
.send_message(
*peer_id,
GossipsubRpc {
subscriptions: Vec::new(), subscriptions: Vec::new(),
messages: message_list, messages: message_list,
control_msgs: Vec::new(), control_msgs: Vec::new(),
} }
.into_protobuf(), .into_protobuf();
)
.is_err() let msg_bytes = message.encoded_len();
{
if self.send_message(*peer_id, message).is_err() {
error!("Failed to send cached messages. Messages too large"); error!("Failed to send cached messages. Messages too large");
} else if let Some(m) = self.metrics.as_mut() { } else if let Some(m) = self.metrics.as_mut() {
// Sending of messages succeeded, register them on the internal metrics. // Sending of messages succeeded, register them on the internal metrics.
for (topic, msg_bytes_vec) in topic_msgs.into_iter() { for topic in topics.iter() {
for msg_bytes in msg_bytes_vec {
m.msg_sent(&topic, msg_bytes); m.msg_sent(&topic, msg_bytes);
} }
} }
} }
}
debug!("Completed IWANT handling for peer: {}", peer_id); debug!("Completed IWANT handling for peer: {}", peer_id);
} }
@ -1391,11 +1396,14 @@ where
{ {
if backoff_time > now { if backoff_time > now {
warn!( warn!(
"GRAFT: peer attempted graft within backoff time, penalizing {}", "[Penalty] Peer attempted graft within backoff time, penalizing {}",
peer_id peer_id
); );
// add behavioural penalty // add behavioural penalty
if let Some((peer_score, ..)) = &mut self.peer_score { if let Some((peer_score, ..)) = &mut self.peer_score {
if let Some(metrics) = self.metrics.as_mut() {
metrics.register_score_penalty(Penalty::GraftBackoff);
}
peer_score.add_penalty(peer_id, 1); peer_score.add_penalty(peer_id, 1);
// check the flood cutoff // check the flood cutoff
@ -1668,15 +1676,11 @@ where
"Rejecting message from peer {} because of blacklisted source: {}", "Rejecting message from peer {} because of blacklisted source: {}",
propagation_source, source propagation_source, source
); );
if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score { self.handle_invalid_message(
peer_score.reject_message(
propagation_source, propagation_source,
msg_id, raw_message,
&raw_message.topic,
RejectReason::BlackListedSource, RejectReason::BlackListedSource,
); );
gossip_promises.reject_message(msg_id, &RejectReason::BlackListedSource);
}
return false; return false;
} }
} }
@ -1702,15 +1706,7 @@ where
"Dropping message {} claiming to be from self but forwarded from {}", "Dropping message {} claiming to be from self but forwarded from {}",
msg_id, propagation_source msg_id, propagation_source
); );
if let Some((peer_score, _, _, gossip_promises)) = &mut self.peer_score { self.handle_invalid_message(propagation_source, raw_message, RejectReason::SelfOrigin);
peer_score.reject_message(
propagation_source,
msg_id,
&raw_message.topic,
RejectReason::SelfOrigin,
);
gossip_promises.reject_message(msg_id, &RejectReason::SelfOrigin);
}
return false; return false;
} }
@ -1725,6 +1721,11 @@ where
mut raw_message: RawGossipsubMessage, mut raw_message: RawGossipsubMessage,
propagation_source: &PeerId, propagation_source: &PeerId,
) { ) {
// Record the received metric
if let Some(metrics) = self.metrics.as_mut() {
metrics.msg_recvd_unfiltered(&raw_message.topic, raw_message.raw_protobuf_len());
}
let fast_message_id = self.config.fast_message_id(&raw_message); let fast_message_id = self.config.fast_message_id(&raw_message);
if let Some(fast_message_id) = fast_message_id.as_ref() { if let Some(fast_message_id) = fast_message_id.as_ref() {
@ -1758,8 +1759,8 @@ where
// Reject the message and return // Reject the message and return
self.handle_invalid_message( self.handle_invalid_message(
propagation_source, propagation_source,
raw_message, &raw_message,
ValidationError::TransformFailed, RejectReason::ValidationError(ValidationError::TransformFailed),
); );
return; return;
} }
@ -1796,6 +1797,11 @@ where
msg_id msg_id
); );
// Record the received message with the metrics
if let Some(metrics) = self.metrics.as_mut() {
metrics.msg_recvd(&message.topic);
}
// Tells score that message arrived (but is maybe not fully validated yet). // Tells score that message arrived (but is maybe not fully validated yet).
// Consider the message as delivered for gossip promises. // Consider the message as delivered for gossip promises.
if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score { if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score {
@ -1845,19 +1851,28 @@ where
fn handle_invalid_message( fn handle_invalid_message(
&mut self, &mut self,
propagation_source: &PeerId, propagation_source: &PeerId,
raw_message: RawGossipsubMessage, raw_message: &RawGossipsubMessage,
validation_error: ValidationError, reject_reason: RejectReason,
) { ) {
if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score { if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score {
let reason = RejectReason::ValidationError(validation_error); if let Some(metrics) = self.metrics.as_mut() {
metrics.register_invalid_message(&raw_message.topic);
}
let fast_message_id_cache = &self.fast_message_id_cache; let fast_message_id_cache = &self.fast_message_id_cache;
if let Some(msg_id) = self if let Some(msg_id) = self
.config .config
.fast_message_id(&raw_message) .fast_message_id(raw_message)
.and_then(|id| fast_message_id_cache.get(&id)) .and_then(|id| fast_message_id_cache.get(&id))
{ {
peer_score.reject_message(propagation_source, msg_id, &raw_message.topic, reason); peer_score.reject_message(
gossip_promises.reject_message(msg_id, &reason); propagation_source,
msg_id,
&raw_message.topic,
reject_reason,
);
gossip_promises.reject_message(msg_id, &reject_reason);
} else { } else {
// The message is invalid, we reject it ignoring any gossip promises. If a peer is // The message is invalid, we reject it ignoring any gossip promises. If a peer is
// advertising this message via an IHAVE and it's invalid it will be double // advertising this message via an IHAVE and it's invalid it will be double
@ -2067,6 +2082,9 @@ where
if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score { if let Some((peer_score, .., gossip_promises)) = &mut self.peer_score {
for (peer, count) in gossip_promises.get_broken_promises() { for (peer, count) in gossip_promises.get_broken_promises() {
peer_score.add_penalty(&peer, count); peer_score.add_penalty(&peer, count);
if let Some(metrics) = self.metrics.as_mut() {
metrics.register_score_penalty(Penalty::BrokenPromise);
}
} }
} }
} }
@ -2074,6 +2092,7 @@ where
/// Heartbeat function which shifts the memcache and updates the mesh. /// Heartbeat function which shifts the memcache and updates the mesh.
fn heartbeat(&mut self) { fn heartbeat(&mut self) {
debug!("Starting heartbeat"); debug!("Starting heartbeat");
let start = Instant::now();
self.heartbeat_ticks += 1; self.heartbeat_ticks += 1;
@ -2098,13 +2117,15 @@ where
} }
} }
// cache scores throughout the heartbeat // Cache the scores of all connected peers, and record metrics for current penalties.
let mut scores = HashMap::new(); let mut scores = HashMap::with_capacity(self.connected_peers.len());
let peer_score = &self.peer_score; if let Some((peer_score, ..)) = &self.peer_score {
let mut score = |p: &PeerId| match peer_score { for peer_id in self.connected_peers.keys() {
Some((peer_score, ..)) => *scores.entry(*p).or_insert_with(|| peer_score.score(p)), scores
_ => 0.0, .entry(peer_id)
}; .or_insert_with(|| peer_score.metric_score(&peer_id, self.metrics.as_mut()));
}
}
// maintain the mesh for each topic // maintain the mesh for each topic
for (topic_hash, peers) in self.mesh.iter_mut() { for (topic_hash, peers) in self.mesh.iter_mut() {
@ -2116,35 +2137,35 @@ where
// drop all peers with negative score, without PX // drop all peers with negative score, without PX
// if there is at some point a stable retain method for BTreeSet the following can be // if there is at some point a stable retain method for BTreeSet the following can be
// written more efficiently with retain. // written more efficiently with retain.
let to_remove: Vec<_> = peers let mut to_remove_peers = Vec::new();
.iter() for peer_id in peers.iter() {
.filter(|&p| { let peer_score = *scores.get(peer_id).unwrap_or(&0.0);
if score(p) < 0.0 {
// Record the score per mesh
if let Some(metrics) = self.metrics.as_mut() {
metrics.observe_mesh_peers_score(topic_hash, peer_score);
}
if peer_score < 0.0 {
debug!( debug!(
"HEARTBEAT: Prune peer {:?} with negative score [score = {}, topic = \ "HEARTBEAT: Prune peer {:?} with negative score [score = {}, topic = \
{}]", {}]",
p, peer_id, peer_score, topic_hash
score(p),
topic_hash
); );
let current_topic = to_prune.entry(*p).or_insert_with(Vec::new); let current_topic = to_prune.entry(*peer_id).or_insert_with(Vec::new);
current_topic.push(topic_hash.clone()); current_topic.push(topic_hash.clone());
no_px.insert(*p); no_px.insert(*peer_id);
true to_remove_peers.push(*peer_id);
} else { }
false
} }
})
.cloned()
.collect();
if let Some(m) = self.metrics.as_mut() { if let Some(m) = self.metrics.as_mut() {
m.peers_removed(topic_hash, Churn::BadScore, to_remove.len()) m.peers_removed(topic_hash, Churn::BadScore, to_remove_peers.len())
} }
for peer in to_remove { for peer_id in to_remove_peers {
peers.remove(&peer); peers.remove(&peer_id);
} }
// too little peers - add some // too little peers - add some
@ -2166,7 +2187,7 @@ where
!peers.contains(peer) !peers.contains(peer)
&& !explicit_peers.contains(peer) && !explicit_peers.contains(peer)
&& !backoffs.is_backoff_with_slack(topic_hash, peer) && !backoffs.is_backoff_with_slack(topic_hash, peer)
&& score(peer) >= 0.0 && *scores.get(peer).unwrap_or(&0.0) >= 0.0
}, },
); );
for peer in &peer_list { for peer in &peer_list {
@ -2195,8 +2216,12 @@ where
let mut rng = thread_rng(); let mut rng = thread_rng();
let mut shuffled = peers.iter().cloned().collect::<Vec<_>>(); let mut shuffled = peers.iter().cloned().collect::<Vec<_>>();
shuffled.shuffle(&mut rng); shuffled.shuffle(&mut rng);
shuffled shuffled.sort_by(|p1, p2| {
.sort_by(|p1, p2| score(p1).partial_cmp(&score(p2)).unwrap_or(Ordering::Equal)); let score_p1 = *scores.get(p1).unwrap_or(&0.0);
let score_p2 = *scores.get(p2).unwrap_or(&0.0);
score_p1.partial_cmp(&score_p2).unwrap_or(Ordering::Equal)
});
// shuffle everything except the last retain_scores many peers (the best ones) // shuffle everything except the last retain_scores many peers (the best ones)
shuffled[..peers.len() - self.config.retain_scores()].shuffle(&mut rng); shuffled[..peers.len() - self.config.retain_scores()].shuffle(&mut rng);
@ -2255,7 +2280,7 @@ where
!peers.contains(peer) !peers.contains(peer)
&& !explicit_peers.contains(peer) && !explicit_peers.contains(peer)
&& !backoffs.is_backoff_with_slack(topic_hash, peer) && !backoffs.is_backoff_with_slack(topic_hash, peer)
&& score(peer) >= 0.0 && *scores.get(peer).unwrap_or(&0.0) >= 0.0
&& outbound_peers.contains(peer) && outbound_peers.contains(peer)
}, },
); );
@ -2288,19 +2313,27 @@ where
// now compute the median peer score in the mesh // now compute the median peer score in the mesh
let mut peers_by_score: Vec<_> = peers.iter().collect(); let mut peers_by_score: Vec<_> = peers.iter().collect();
peers_by_score peers_by_score.sort_by(|p1, p2| {
.sort_by(|p1, p2| score(p1).partial_cmp(&score(p2)).unwrap_or(Equal)); let p1_score = *scores.get(p1).unwrap_or(&0.0);
let p2_score = *scores.get(p2).unwrap_or(&0.0);
p1_score.partial_cmp(&p2_score).unwrap_or(Equal)
});
let middle = peers_by_score.len() / 2; let middle = peers_by_score.len() / 2;
let median = if peers_by_score.len() % 2 == 0 { let median = if peers_by_score.len() % 2 == 0 {
(score( let sub_middle_peer = *peers_by_score
*peers_by_score.get(middle - 1).expect( .get(middle - 1)
"middle < vector length and middle > 0 since peers.len() > 0", .expect("middle < vector length and middle > 0 since peers.len() > 0");
), let sub_middle_score = *scores.get(sub_middle_peer).unwrap_or(&0.0);
) + score(*peers_by_score.get(middle).expect("middle < vector length"))) let middle_peer =
* 0.5 *peers_by_score.get(middle).expect("middle < vector length");
let middle_score = *scores.get(middle_peer).unwrap_or(&0.0);
(sub_middle_score + middle_score) * 0.5
} else { } else {
score(*peers_by_score.get(middle).expect("middle < vector length")) *scores
.get(*peers_by_score.get(middle).expect("middle < vector length"))
.unwrap_or(&0.0)
}; };
// if the median score is below the threshold, select a better peer (if any) and // if the median score is below the threshold, select a better peer (if any) and
@ -2311,11 +2344,11 @@ where
&self.connected_peers, &self.connected_peers,
topic_hash, topic_hash,
self.config.opportunistic_graft_peers(), self.config.opportunistic_graft_peers(),
|peer| { |peer_id| {
!peers.contains(peer) !peers.contains(peer_id)
&& !explicit_peers.contains(peer) && !explicit_peers.contains(peer_id)
&& !backoffs.is_backoff_with_slack(topic_hash, peer) && !backoffs.is_backoff_with_slack(topic_hash, peer_id)
&& score(peer) > median && *scores.get(peer_id).unwrap_or(&0.0) > median
}, },
); );
for peer in &peer_list { for peer in &peer_list {
@ -2367,9 +2400,10 @@ where
}; };
for peer in peers.iter() { for peer in peers.iter() {
// is the peer still subscribed to the topic? // is the peer still subscribed to the topic?
let peer_score = *scores.get(peer).unwrap_or(&0.0);
match self.peer_topics.get(peer) { match self.peer_topics.get(peer) {
Some(topics) => { Some(topics) => {
if !topics.contains(topic_hash) || score(peer) < publish_threshold { if !topics.contains(topic_hash) || peer_score < publish_threshold {
debug!( debug!(
"HEARTBEAT: Peer removed from fanout for topic: {:?}", "HEARTBEAT: Peer removed from fanout for topic: {:?}",
topic_hash topic_hash
@ -2401,10 +2435,10 @@ where
&self.connected_peers, &self.connected_peers,
topic_hash, topic_hash,
needed_peers, needed_peers,
|peer| { |peer_id| {
!peers.contains(peer) !peers.contains(peer_id)
&& !explicit_peers.contains(peer) && !explicit_peers.contains(peer_id)
&& score(peer) < publish_threshold && *scores.get(peer_id).unwrap_or(&0.0) < publish_threshold
}, },
); );
peers.extend(new_peers); peers.extend(new_peers);
@ -2412,12 +2446,6 @@ where
} }
if self.peer_score.is_some() { if self.peer_score.is_some() {
trace!("Peer_scores: {:?}", {
for peer in self.peer_topics.keys() {
score(peer);
}
scores
});
trace!("Mesh message deliveries: {:?}", { trace!("Mesh message deliveries: {:?}", {
self.mesh self.mesh
.iter() .iter()
@ -2429,7 +2457,7 @@ where
.map(|p| { .map(|p| {
( (
*p, *p,
peer_score self.peer_score
.as_ref() .as_ref()
.expect("peer_score.is_some()") .expect("peer_score.is_some()")
.0 .0
@ -2458,6 +2486,10 @@ where
self.mcache.shift(); self.mcache.shift();
debug!("Completed Heartbeat"); debug!("Completed Heartbeat");
if let Some(metrics) = self.metrics.as_mut() {
let duration = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
metrics.observe_heartbeat_duration(duration);
}
} }
/// Emits gossip - Send IHAVE messages to a random set of gossip peers. This is applied to mesh /// Emits gossip - Send IHAVE messages to a random set of gossip peers. This is applied to mesh
@ -2695,11 +2727,12 @@ where
} }
.into_protobuf(); .into_protobuf();
let msg_bytes = event.encoded_len();
for peer in recipient_peers.iter() { for peer in recipient_peers.iter() {
debug!("Sending message: {:?} to peer {:?}", msg_id, peer); debug!("Sending message: {:?} to peer {:?}", msg_id, peer);
self.send_message(*peer, event.clone())?; self.send_message(*peer, event.clone())?;
if let Some(m) = self.metrics.as_mut() { if let Some(m) = self.metrics.as_mut() {
m.msg_sent(&message.topic, message.data.len()); m.msg_sent(&message.topic, msg_bytes);
} }
} }
debug!("Completed forwarding message"); debug!("Completed forwarding message");
@ -3110,6 +3143,17 @@ where
// NOTE: It is possible the peer has already been removed from all mappings if it does not // NOTE: It is possible the peer has already been removed from all mappings if it does not
// support the protocol. // support the protocol.
self.peer_topics.remove(peer_id); self.peer_topics.remove(peer_id);
// If metrics are enabled, register the disconnection of a peer based on its protocol.
if let Some(metrics) = self.metrics.as_mut() {
let peer_kind = &self
.connected_peers
.get(peer_id)
.expect("Connected peer must be registered")
.kind;
metrics.peer_protocol_disconnected(peer_kind.clone());
}
self.connected_peers.remove(peer_id); self.connected_peers.remove(peer_id);
if let Some((peer_score, ..)) = &mut self.peer_score { if let Some((peer_score, ..)) = &mut self.peer_score {
@ -3257,6 +3301,11 @@ where
match handler_event { match handler_event {
HandlerEvent::PeerKind(kind) => { HandlerEvent::PeerKind(kind) => {
// We have identified the protocol this peer is using // We have identified the protocol this peer is using
if let Some(metrics) = self.metrics.as_mut() {
metrics.peer_protocol_connected(kind.clone());
}
if let PeerKind::NotSupported = kind { if let PeerKind::NotSupported = kind {
debug!( debug!(
"Peer does not support gossipsub protocols. {}", "Peer does not support gossipsub protocols. {}",
@ -3304,8 +3353,8 @@ where
for (raw_message, validation_error) in invalid_messages { for (raw_message, validation_error) in invalid_messages {
self.handle_invalid_message( self.handle_invalid_message(
&propagation_source, &propagation_source,
raw_message, &raw_message,
validation_error, RejectReason::ValidationError(validation_error),
) )
} }
} else { } else {

View File

@ -25,11 +25,13 @@ use std::collections::HashMap;
use open_metrics_client::encoding::text::Encode; use open_metrics_client::encoding::text::Encode;
use open_metrics_client::metrics::counter::Counter; use open_metrics_client::metrics::counter::Counter;
use open_metrics_client::metrics::family::Family; use open_metrics_client::metrics::family::{Family, MetricConstructor};
use open_metrics_client::metrics::gauge::Gauge; use open_metrics_client::metrics::gauge::Gauge;
use open_metrics_client::metrics::histogram::{linear_buckets, Histogram};
use open_metrics_client::registry::Registry; use open_metrics_client::registry::Registry;
use crate::topic::TopicHash; use crate::topic::TopicHash;
use crate::types::{MessageAcceptance, PeerKind};
// Default value that limits for how many topics do we store metrics. // Default value that limits for how many topics do we store metrics.
const DEFAULT_MAX_TOPICS: usize = 300; const DEFAULT_MAX_TOPICS: usize = 300;
@ -60,48 +62,6 @@ impl Default for Config {
/// Whether we have ever been subscribed to this topic. /// Whether we have ever been subscribed to this topic.
type EverSubscribed = bool; type EverSubscribed = bool;
/// Reasons why a peer was included in the mesh.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
pub enum Inclusion {
/// Peer was a fanaout peer.
Fanaout,
/// Included from random selection.
Random,
/// Peer subscribed.
Subscribed,
/// Peer was included to fill the outbound quota.
Outbound,
}
/// Reasons why a peer was removed from the mesh.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
pub enum Churn {
/// Peer disconnected.
Dc,
/// Peer had a bad score.
BadScore,
/// Peer sent a PRUNE.
Prune,
/// Peer unsubscribed.
Unsub,
/// Too many peers.
Excess,
}
/// Label for the mesh inclusion event metrics.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
struct InclusionLabel {
topic: TopicHash,
reason: Inclusion,
}
/// Label for the mesh churn event metrics.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
struct ChurnLabel {
topic: TopicHash,
reason: Churn,
}
/// A collection of metrics used throughout the Gossipsub behaviour. /// A collection of metrics used throughout the Gossipsub behaviour.
pub struct Metrics { pub struct Metrics {
/* Configuration parameters */ /* Configuration parameters */
@ -123,6 +83,14 @@ pub struct Metrics {
/// Number of peers subscribed to each topic. This allows us to analyze a topic's behaviour /// Number of peers subscribed to each topic. This allows us to analyze a topic's behaviour
/// regardless of our subscription status. /// regardless of our subscription status.
topic_peers_count: Family<TopicHash, Gauge>, topic_peers_count: Family<TopicHash, Gauge>,
/// The number of invalid messages received for a given topic.
invalid_messages: Family<TopicHash, Counter>,
/// The number of messages accepted by the application (validation result).
accepted_messages: Family<TopicHash, Counter>,
/// The number of messages ignored by the application (validation result).
ignored_messages: Family<TopicHash, Counter>,
/// The number of messages rejected by the application (validation result).
rejected_messages: Family<TopicHash, Counter>,
/* Metrics regarding mesh state */ /* Metrics regarding mesh state */
/// Number of peers in our mesh. This metric should be updated with the count of peers for a /// Number of peers in our mesh. This metric should be updated with the count of peers for a
@ -133,11 +101,43 @@ pub struct Metrics {
/// Number of times we remove peers in a topic mesh for different reasons. /// Number of times we remove peers in a topic mesh for different reasons.
mesh_peer_churn_events: Family<ChurnLabel, Counter>, mesh_peer_churn_events: Family<ChurnLabel, Counter>,
/* Metrics regarding messages sent */ /* Metrics regarding messages sent/received */
/// Number of gossip messages sent to each topic. /// Number of gossip messages sent to each topic.
topic_msg_sent_counts: Family<TopicHash, Counter>, topic_msg_sent_counts: Family<TopicHash, Counter>,
/// Bytes from gossip messages sent to each topic . /// Bytes from gossip messages sent to each topic.
topic_msg_sent_bytes: Family<TopicHash, Counter>, topic_msg_sent_bytes: Family<TopicHash, Counter>,
/// Number of gossipsub messages published to each topic.
topic_msg_published: Family<TopicHash, Counter>,
/// Number of gossipsub messages received on each topic (without filtering duplicates).
topic_msg_recv_counts_unfiltered: Family<TopicHash, Counter>,
/// Number of gossipsub messages received on each topic (after filtering duplicates).
topic_msg_recv_counts: Family<TopicHash, Counter>,
/// Bytes received from gossip messages for each topic.
topic_msg_recv_bytes: Family<TopicHash, Counter>,
/* Metrics related to scoring */
/// Histogram of the scores for each mesh topic.
score_per_mesh: Family<TopicHash, Histogram, HistBuilder>,
/// A counter of the kind of penalties being applied to peers.
scoring_penalties: Family<PenaltyLabel, Counter>,
/* General Metrics */
/// Gossipsub supports floodsub, gossipsub v1.0 and gossipsub v1.1. Peers are classified based
/// on which protocol they support. This metric keeps track of the number of peers that are
/// connected of each type.
peers_per_protocol: Family<ProtocolLabel, Gauge>,
/// The time it takes to complete one iteration of the heartbeat.
heartbeat_duration: Histogram,
/* Performance metrics */
/// When the user validates a message, it tries to re propagate it to its mesh peers. If the
/// message expires from the memcache before it can be validated, we count this a cache miss
/// and it is an indicator that the memcache size should be increased.
memcache_misses: Counter,
/// The number of times we have decided that an IWANT control message is required for this
/// topic. A very high metric might indicate an underperforming network.
topic_iwant_msgs: Family<TopicHash, Counter>,
} }
impl Metrics { impl Metrics {
@ -165,6 +165,26 @@ impl Metrics {
"Number of peers subscribed to each topic" "Number of peers subscribed to each topic"
); );
let invalid_messages = register_family!(
"invalid_messages_per_topic",
"Number of invalid messages received for each topic"
);
let accepted_messages = register_family!(
"accepted_messages_per_topic",
"Number of accepted messages received for each topic"
);
let ignored_messages = register_family!(
"ignored_messages_per_topic",
"Number of ignored messages received for each topic"
);
let rejected_messages = register_family!(
"accepted_messages_per_topic",
"Number of rejected messages received for each topic"
);
let mesh_peer_counts = register_family!( let mesh_peer_counts = register_family!(
"mesh_peer_counts", "mesh_peer_counts",
"Number of peers in each topic in our mesh" "Number of peers in each topic in our mesh"
@ -177,27 +197,114 @@ impl Metrics {
"mesh_peer_churn_events", "mesh_peer_churn_events",
"Number of times a peer gets removed from our mesh for different reasons" "Number of times a peer gets removed from our mesh for different reasons"
); );
let topic_msg_sent_counts = register_family!( let topic_msg_sent_counts = register_family!(
"topic_msg_sent_counts", "topic_msg_sent_counts",
"Number of gossip messages sent to each topic." "Number of gossip messages sent to each topic"
);
let topic_msg_published = register_family!(
"topic_msg_published",
"Number of gossip messages published to each topic"
); );
let topic_msg_sent_bytes = register_family!( let topic_msg_sent_bytes = register_family!(
"topic_msg_sent_bytes", "topic_msg_sent_bytes",
"Bytes from gossip messages sent to each topic." "Bytes from gossip messages sent to each topic"
); );
let topic_msg_recv_counts_unfiltered = register_family!(
"topic_msg_recv_counts_unfiltered",
"Number of gossip messages received on each topic (without duplicates being filtered)"
);
let topic_msg_recv_counts = register_family!(
"topic_msg_recv_counts",
"Number of gossip messages received on each topic (after duplicates have been filtered)"
);
let topic_msg_recv_bytes = register_family!(
"topic_msg_recv_bytes",
"Bytes received from gossip messages for each topic"
);
// TODO: Update default variables once a builder pattern is used.
let gossip_threshold = -4000.0;
let publish_threshold = -8000.0;
let greylist_threshold = -16000.0;
let histogram_buckets: Vec<f64> = vec![
greylist_threshold,
publish_threshold,
gossip_threshold,
gossip_threshold / 2.0,
gossip_threshold / 4.0,
0.0,
1.0,
10.0,
100.0,
];
let hist_builder = HistBuilder {
buckets: histogram_buckets,
};
let score_per_mesh: Family<_, _, HistBuilder> = Family::new_with_constructor(hist_builder);
registry.register(
"score_per_mesh",
"Histogram of scores per mesh topic",
Box::new(score_per_mesh.clone()),
);
let scoring_penalties = register_family!(
"scoring_penalties",
"Counter of types of scoring penalties given to peers"
);
let peers_per_protocol = register_family!(
"peers_per_protocol",
"Number of connected peers by protocol type"
);
let heartbeat_duration = Histogram::new(linear_buckets(0.0, 50.0, 10));
registry.register(
"heartbeat_duration",
"Histogram of observed heartbeat durations",
Box::new(heartbeat_duration.clone()),
);
let topic_iwant_msgs = register_family!(
"topic_iwant_msgs",
"Number of times we have decided an IWANT is required for this topic"
);
let memcache_misses = {
let metric = Counter::default();
registry.register(
"memcache_misses",
"Number of times a message is not found in the duplicate cache when validating",
Box::new(metric.clone()),
);
metric
};
Self { Self {
max_topics, max_topics,
max_never_subscribed_topics, max_never_subscribed_topics,
topic_info: HashMap::default(), topic_info: HashMap::default(),
topic_subscription_status, topic_subscription_status,
topic_peers_count, topic_peers_count,
invalid_messages,
accepted_messages,
ignored_messages,
rejected_messages,
mesh_peer_counts, mesh_peer_counts,
mesh_peer_inclusion_events, mesh_peer_inclusion_events,
mesh_peer_churn_events, mesh_peer_churn_events,
topic_msg_sent_counts, topic_msg_sent_counts,
topic_msg_sent_bytes, topic_msg_sent_bytes,
topic_msg_published,
topic_msg_recv_counts_unfiltered,
topic_msg_recv_counts,
topic_msg_recv_bytes,
score_per_mesh,
scoring_penalties,
peers_per_protocol,
heartbeat_duration,
memcache_misses,
topic_iwant_msgs,
} }
} }
@ -265,7 +372,7 @@ impl Metrics {
if self.register_topic(topic).is_ok() { if self.register_topic(topic).is_ok() {
self.mesh_peer_inclusion_events self.mesh_peer_inclusion_events
.get_or_create(&InclusionLabel { .get_or_create(&InclusionLabel {
topic: topic.clone(), hash: topic.to_string(),
reason, reason,
}) })
.inc_by(count as u64); .inc_by(count as u64);
@ -277,7 +384,7 @@ impl Metrics {
if self.register_topic(topic).is_ok() { if self.register_topic(topic).is_ok() {
self.mesh_peer_churn_events self.mesh_peer_churn_events
.get_or_create(&ChurnLabel { .get_or_create(&ChurnLabel {
topic: topic.clone(), hash: topic.to_string(),
reason, reason,
}) })
.inc_by(count as u64); .inc_by(count as u64);
@ -292,6 +399,27 @@ impl Metrics {
} }
} }
/// Register that an invalid message was received on a specific topic.
pub fn register_invalid_message(&mut self, topic: &TopicHash) {
if self.register_topic(topic).is_ok() {
self.invalid_messages.get_or_create(topic).inc();
}
}
/// Register a score penalty.
pub fn register_score_penalty(&mut self, penalty: Penalty) {
self.scoring_penalties
.get_or_create(&PenaltyLabel { penalty })
.inc();
}
/// Registers that a message was published on a specific topic.
pub fn register_published_message(&mut self, topic: &TopicHash) {
if self.register_topic(topic).is_ok() {
self.topic_msg_published.get_or_create(topic).inc();
}
}
/// Register sending a message over a topic. /// Register sending a message over a topic.
pub fn msg_sent(&mut self, topic: &TopicHash, bytes: usize) { pub fn msg_sent(&mut self, topic: &TopicHash, bytes: usize) {
if self.register_topic(topic).is_ok() { if self.register_topic(topic).is_ok() {
@ -301,4 +429,155 @@ impl Metrics {
.inc_by(bytes as u64); .inc_by(bytes as u64);
} }
} }
/// Register that a message was received (and was not a duplicate).
pub fn msg_recvd(&mut self, topic: &TopicHash) {
if self.register_topic(topic).is_ok() {
self.topic_msg_recv_counts.get_or_create(topic).inc();
}
}
/// Register that a message was received (could have been a duplicate).
pub fn msg_recvd_unfiltered(&mut self, topic: &TopicHash, bytes: usize) {
if self.register_topic(topic).is_ok() {
self.topic_msg_recv_counts_unfiltered
.get_or_create(topic)
.inc();
self.topic_msg_recv_bytes
.get_or_create(topic)
.inc_by(bytes as u64);
}
}
pub fn register_msg_validation(&mut self, topic: &TopicHash, validation: &MessageAcceptance) {
if self.register_topic(topic).is_ok() {
match validation {
MessageAcceptance::Accept => self.accepted_messages.get_or_create(topic).inc(),
MessageAcceptance::Ignore => self.ignored_messages.get_or_create(topic).inc(),
MessageAcceptance::Reject => self.rejected_messages.get_or_create(topic).inc(),
};
}
}
/// Register a memcache miss.
pub fn memcache_miss(&mut self) {
self.memcache_misses.inc();
}
/// Register sending an IWANT msg for this topic.
pub fn register_iwant(&mut self, topic: &TopicHash) {
if self.register_topic(topic).is_ok() {
self.topic_iwant_msgs.get_or_create(topic).inc();
}
}
/// Observes a heartbeat duration.
pub fn observe_heartbeat_duration(&mut self, millis: u64) {
self.heartbeat_duration.observe(millis as f64);
}
/// Observe a score of a mesh peer.
pub fn observe_mesh_peers_score(&mut self, topic: &TopicHash, score: f64) {
if self.register_topic(topic).is_ok() {
self.score_per_mesh.get_or_create(topic).observe(score);
}
}
/// Register a new peers connection based on its protocol.
pub fn peer_protocol_connected(&mut self, kind: PeerKind) {
self.peers_per_protocol
.get_or_create(&ProtocolLabel { protocol: kind })
.inc();
}
/// Removes a peer from the counter based on its protocol when it disconnects.
pub fn peer_protocol_disconnected(&mut self, kind: PeerKind) {
let metric = self
.peers_per_protocol
.get_or_create(&ProtocolLabel { protocol: kind });
if metric.get() == 0 {
return;
} else {
// decrement the counter
metric.set(metric.get() - 1);
}
}
}
/// Reasons why a peer was included in the mesh.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
pub enum Inclusion {
/// Peer was a fanaout peer.
Fanout,
/// Included from random selection.
Random,
/// Peer subscribed.
Subscribed,
/// Peer was included to fill the outbound quota.
Outbound,
}
/// Reasons why a peer was removed from the mesh.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
pub enum Churn {
/// Peer disconnected.
Dc,
/// Peer had a bad score.
BadScore,
/// Peer sent a PRUNE.
Prune,
/// Peer unsubscribed.
Unsub,
/// Too many peers.
Excess,
}
/// Kinds of reasons a peer's score has been penalized
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
pub enum Penalty {
/// A peer grafted before waiting the back-off time.
GraftBackoff,
/// A Peer did not respond to an IWANT request in time.
BrokenPromise,
/// A Peer did not send enough messages as expected.
MessageDeficit,
/// Too many peers under one IP address.
IPColocation,
}
/// Label for the mesh inclusion event metrics.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
struct InclusionLabel {
hash: String,
reason: Inclusion,
}
/// Label for the mesh churn event metrics.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
struct ChurnLabel {
hash: String,
reason: Churn,
}
/// Label for the kinds of protocols peers can connect as.
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
struct ProtocolLabel {
protocol: PeerKind,
}
/// Label for the kinds of scoring penalties that can occur
#[derive(PartialEq, Eq, Hash, Encode, Clone)]
struct PenaltyLabel {
penalty: Penalty,
}
#[derive(Clone)]
struct HistBuilder {
buckets: Vec<f64>,
}
impl MetricConstructor<Histogram> for HistBuilder {
fn new_metric(&self) -> Histogram {
Histogram::new(self.buckets.clone().into_iter())
}
} }

View File

@ -21,6 +21,7 @@
//! //!
//! Manages and stores the Scoring logic of a particular peer on the gossipsub behaviour. //! Manages and stores the Scoring logic of a particular peer on the gossipsub behaviour.
use crate::metrics::{Metrics, Penalty};
use crate::time_cache::TimeCache; use crate::time_cache::TimeCache;
use crate::{MessageId, TopicHash}; use crate::{MessageId, TopicHash};
use instant::Instant; use instant::Instant;
@ -212,8 +213,14 @@ impl PeerScore {
} }
} }
/// Returns the score for a peer. /// Returns the score for a peer
pub fn score(&self, peer_id: &PeerId) -> f64 { pub fn score(&self, peer_id: &PeerId) -> f64 {
self.metric_score(peer_id, None)
}
/// Returns the score for a peer, logging metrics. This is called from the heartbeat and
/// increments the metric counts for penalties.
pub fn metric_score(&self, peer_id: &PeerId, mut metrics: Option<&mut Metrics>) -> f64 {
let peer_stats = match self.peer_stats.get(peer_id) { let peer_stats = match self.peer_stats.get(peer_id) {
Some(v) => v, Some(v) => v,
None => return 0.0, None => return 0.0,
@ -264,6 +271,9 @@ impl PeerScore {
- topic_stats.mesh_message_deliveries; - topic_stats.mesh_message_deliveries;
let p3 = deficit * deficit; let p3 = deficit * deficit;
topic_score += p3 * topic_params.mesh_message_deliveries_weight; topic_score += p3 * topic_params.mesh_message_deliveries_weight;
if let Some(metrics) = metrics.as_mut() {
metrics.register_score_penalty(Penalty::MessageDeficit);
}
debug!( debug!(
"[Penalty] The peer {} has a mesh message deliveries deficit of {} in topic\ "[Penalty] The peer {} has a mesh message deliveries deficit of {} in topic\
{} and will get penalized by {}", {} and will get penalized by {}",
@ -313,6 +323,9 @@ impl PeerScore {
if (peers_in_ip as f64) > self.params.ip_colocation_factor_threshold { if (peers_in_ip as f64) > self.params.ip_colocation_factor_threshold {
let surplus = (peers_in_ip as f64) - self.params.ip_colocation_factor_threshold; let surplus = (peers_in_ip as f64) - self.params.ip_colocation_factor_threshold;
let p6 = surplus * surplus; let p6 = surplus * surplus;
if let Some(metrics) = metrics.as_mut() {
metrics.register_score_penalty(Penalty::IPColocation);
}
debug!( debug!(
"[Penalty] The peer {} gets penalized because of too many peers with the ip {}. \ "[Penalty] The peer {} gets penalized because of too many peers with the ip {}. \
The surplus is {}. ", The surplus is {}. ",
@ -600,6 +613,7 @@ impl PeerScore {
"[Penalty] Message from {} rejected because of ValidationError or SelfOrigin", "[Penalty] Message from {} rejected because of ValidationError or SelfOrigin",
from from
); );
self.mark_invalid_message_delivery(from, topic_hash); self.mark_invalid_message_delivery(from, topic_hash);
} }

View File

@ -22,6 +22,8 @@
use crate::rpc_proto; use crate::rpc_proto;
use crate::TopicHash; use crate::TopicHash;
use libp2p_core::{connection::ConnectionId, PeerId}; use libp2p_core::{connection::ConnectionId, PeerId};
use open_metrics_client::encoding::text::Encode;
use prost::Message;
use std::fmt; use std::fmt;
use std::fmt::Debug; use std::fmt::Debug;
@ -89,7 +91,7 @@ pub struct PeerConnections {
} }
/// Describes the types of peers that can exist in the gossipsub context. /// Describes the types of peers that can exist in the gossipsub context.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Hash, Encode, Eq)]
pub enum PeerKind { pub enum PeerKind {
/// A gossipsub 1.1 peer. /// A gossipsub 1.1 peer.
Gossipsubv1_1, Gossipsubv1_1,
@ -126,6 +128,21 @@ pub struct RawGossipsubMessage {
pub validated: bool, pub validated: bool,
} }
impl RawGossipsubMessage {
/// Calculates the encoded length of this message (used for calculating metrics).
pub fn raw_protobuf_len(&self) -> usize {
let message = rpc_proto::Message {
from: self.source.map(|m| m.to_bytes()),
data: Some(self.data.clone()),
seqno: self.sequence_number.map(|s| s.to_be_bytes().to_vec()),
topic: TopicHash::into_string(self.topic.clone()),
signature: self.signature.clone(),
key: self.key.clone(),
};
message.encoded_len()
}
}
/// The message sent to the user after a [`RawGossipsubMessage`] has been transformed by a /// The message sent to the user after a [`RawGossipsubMessage`] has been transformed by a
/// [`crate::DataTransform`]. /// [`crate::DataTransform`].
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]

View File

@ -37,7 +37,7 @@ use libp2p_gossipsub::{
ValidationMode, ValidationMode,
}; };
use libp2p_plaintext::PlainText2Config; use libp2p_plaintext::PlainText2Config;
use libp2p_swarm::{dial_opts::DialOpts, Swarm, SwarmEvent}; use libp2p_swarm::{Swarm, SwarmEvent};
use libp2p_yamux as yamux; use libp2p_yamux as yamux;
struct Graph { struct Graph {