feat(ping): don't close connections upon failures

Previously, the `libp2p-ping` module came with a policy to close a connection after X failed pings. This is only one of many possible policies on how users would want to do connection management.

We remove this policy without a replacement. If users wish to restore this functionality, they can easily implement such a policy themselves: the default value of `max_failures` was 1, so to restore the previous behaviour users can simply close the connection upon the first received ping error.

In this same patch, we also simplify the API of `ping::Event` by removing the layer of `ping::Success` and instead reporting the RTT to the peer directly.

Related: #3591.

Pull-Request: #3947.
This commit is contained in:
Thomas Eizinger 2023-05-24 14:33:18 +02:00 committed by GitHub
parent a5cd0d0e03
commit 25bc30f07e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 144 additions and 196 deletions

View File

@ -249,7 +249,8 @@ async fn main() -> Result<(), Box<dyn Error>> {
match event {
ping::Event {
peer,
result: Result::Ok(ping::Success::Ping { rtt }),
result: Result::Ok(rtt),
..
} => {
println!(
"ping: rtt to {} is {} ms",
@ -257,27 +258,24 @@ async fn main() -> Result<(), Box<dyn Error>> {
rtt.as_millis()
);
}
ping::Event {
peer,
result: Result::Ok(ping::Success::Pong),
} => {
println!("ping: pong from {}", peer.to_base58());
}
ping::Event {
peer,
result: Result::Err(ping::Failure::Timeout),
..
} => {
println!("ping: timeout to {}", peer.to_base58());
}
ping::Event {
peer,
result: Result::Err(ping::Failure::Unsupported),
..
} => {
println!("ping: {} does not support ping protocol", peer.to_base58());
}
ping::Event {
peer,
result: Result::Err(ping::Failure::Other { error }),
..
} => {
println!("ping: ping::Failure with {}: {error}", peer.to_base58());
}

View File

@ -105,7 +105,8 @@ async fn main() {
}
SwarmEvent::Behaviour(MyBehaviourEvent::Ping(ping::Event {
peer,
result: Ok(ping::Success::Ping { rtt }),
result: Ok(rtt),
..
})) if peer != rendezvous_point => {
log::info!("Ping to {} is {}ms", peer, rtt.as_millis())
}

View File

@ -106,7 +106,8 @@ async fn main() {
}
SwarmEvent::Behaviour(MyBehaviourEvent::Ping(ping::Event {
peer,
result: Ok(ping::Success::Ping { rtt }),
result: Ok(rtt),
..
})) if peer != rendezvous_point => {
log::info!("Ping to {} is {}ms", peer, rtt.as_millis())
}

View File

@ -104,7 +104,8 @@ async fn main() {
}
SwarmEvent::Behaviour(MyBehaviourEvent::Ping(ping::Event {
peer,
result: Ok(ping::Success::Ping { rtt }),
result: Ok(rtt),
..
})) if peer != rendezvous_point => {
log::info!("Ping to {} is {}ms", peer, rtt.as_millis())
}

View File

@ -137,8 +137,8 @@ async fn main() -> Result<()> {
let rtt = loop {
if let Some(SwarmEvent::Behaviour(BehaviourEvent::Ping(ping::Event {
peer: _,
result: Ok(ping::Success::Ping { rtt }),
result: Ok(rtt),
..
}))) = swarm.next().await
{
log::info!("Ping successful: {rtt:?}");

View File

@ -26,9 +26,13 @@
Note that you can use the `_count` metric of the `Histogram` as a replacement for the `Counter`.
See [PR 3927].
- Remove the `pong_received` counter because it is no longer exposed by `libp2p-ping`.
See [PR 3947].
[PR 3715]: https://github.com/libp2p/rust-libp2p/pull/3715
[PR 3927]: https://github.com/libp2p/rust-libp2p/pull/3927
[PR 3325]: https://github.com/libp2p/rust-libp2p/pull/3325
[PR 3947]: https://github.com/libp2p/rust-libp2p/pull/3947
## 0.12.0

View File

@ -55,7 +55,6 @@ enum Failure {
pub(crate) struct Metrics {
rtt: Histogram,
failure: Family<FailureLabels, Counter>,
pong_received: Counter,
}
impl Metrics {
@ -77,28 +76,14 @@ impl Metrics {
failure.clone(),
);
let pong_received = Counter::default();
sub_registry.register(
"pong_received",
"Number of 'pong's received",
pong_received.clone(),
);
Self {
rtt,
failure,
pong_received,
}
Self { rtt, failure }
}
}
impl super::Recorder<libp2p_ping::Event> for Metrics {
fn record(&self, event: &libp2p_ping::Event) {
match &event.result {
Ok(libp2p_ping::Success::Pong) => {
self.pong_received.inc();
}
Ok(libp2p_ping::Success::Ping { rtt }) => {
Ok(rtt) => {
self.rtt.observe(rtt.as_secs_f64());
}
Err(failure) => {

View File

@ -2,10 +2,17 @@
- Raise MSRV to 1.65.
See [PR 3715].
- Remove deprecated items. See [PR 3702].
- Don't close connections on ping failures.
To restore the previous behaviour, users should call `Swarm::close_connection` upon receiving a `ping::Event` with a `ping::Failure`.
This also removes the `max_failures` config option.
See [PR 3947].
[PR 3715]: https://github.com/libp2p/rust-libp2p/pull/3715
[PR 3702]: https://github.com/libp2p/rust-libp2p/pull/3702
[PR 3947]: https://github.com/libp2p/rust-libp2p/pull/3947
## 0.42.0

View File

@ -19,10 +19,11 @@
// DEALINGS IN THE SOFTWARE.
use crate::{protocol, PROTOCOL_NAME};
use futures::future::BoxFuture;
use futures::future::{BoxFuture, Either};
use futures::prelude::*;
use futures_timer::Delay;
use libp2p_core::upgrade::ReadyUpgrade;
use libp2p_identity::PeerId;
use libp2p_swarm::handler::{
ConnectionEvent, DialUpgradeError, FullyNegotiatedInbound, FullyNegotiatedOutbound,
};
@ -34,7 +35,6 @@ use std::collections::VecDeque;
use std::{
error::Error,
fmt, io,
num::NonZeroU32,
task::{Context, Poll},
time::Duration,
};
@ -45,13 +45,8 @@ use void::Void;
pub struct Config {
/// The timeout of an outbound ping.
timeout: Duration,
/// The duration between the last successful outbound or inbound ping
/// and the next outbound ping.
/// The duration between outbound pings.
interval: Duration,
/// The maximum number of failed outbound pings before the associated
/// connection is deemed unhealthy, indicating to the `Swarm` that it
/// should be closed.
max_failures: NonZeroU32,
}
impl Config {
@ -59,23 +54,16 @@ impl Config {
///
/// * [`Config::with_interval`] 15s
/// * [`Config::with_timeout`] 20s
/// * [`Config::with_max_failures`] 1
///
/// These settings have the following effect:
///
/// * A ping is sent every 15 seconds on a healthy connection.
/// * Every ping sent must yield a response within 20 seconds in order to
/// be successful.
/// * A single ping failure is sufficient for the connection to be subject
/// to being closed.
/// * The connection may be closed at any time as far as the ping protocol
/// is concerned, i.e. the ping protocol itself does not keep the
/// connection alive.
pub fn new() -> Self {
Self {
timeout: Duration::from_secs(20),
interval: Duration::from_secs(15),
max_failures: NonZeroU32::new(1).expect("1 != 0"),
}
}
@ -90,13 +78,6 @@ impl Config {
self.interval = d;
self
}
/// Sets the maximum number of consecutive ping failures upon which the remote
/// peer is considered unreachable and the connection closed.
pub fn with_max_failures(mut self, n: NonZeroU32) -> Self {
self.max_failures = n;
self
}
}
impl Default for Config {
@ -105,17 +86,6 @@ impl Default for Config {
}
}
/// The successful result of processing an inbound or outbound ping.
#[derive(Debug)]
pub enum Success {
/// Received a ping and sent back a pong.
Pong,
/// Sent a ping and received back a pong.
///
/// Includes the round-trip time.
Ping { rtt: Duration },
}
/// An outbound ping failure.
#[derive(Debug)]
pub enum Failure {
@ -130,6 +100,12 @@ pub enum Failure {
},
}
impl Failure {
    /// Builds a [`Failure::Other`] by boxing the given error.
    fn other<E>(e: E) -> Self
    where
        E: std::error::Error + Send + 'static,
    {
        Self::Other { error: Box::new(e) }
    }
}
impl fmt::Display for Failure {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
@ -152,14 +128,11 @@ impl Error for Failure {
/// Protocol handler that handles pinging the remote at a regular period
/// and answering ping queries.
///
/// If the remote doesn't respond, produces an error that closes the connection.
pub struct Handler {
/// Configuration options.
config: Config,
/// The timer used for the delay to the next ping as well as
/// the ping timeout.
timer: Delay,
/// The timer used for the delay to the next ping.
interval: Delay,
/// Outbound ping failures that are pending to be processed by `poll()`.
pending_errors: VecDeque<Failure>,
/// The number of consecutive ping failures that occurred.
@ -174,6 +147,8 @@ pub struct Handler {
inbound: Option<PongFuture>,
/// Tracks the state of our handler.
state: State,
/// The peer we are connected to.
peer: PeerId,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@ -191,10 +166,11 @@ enum State {
impl Handler {
/// Builds a new [`Handler`] with the given configuration.
pub fn new(config: Config) -> Self {
pub fn new(config: Config, peer: PeerId) -> Self {
Handler {
peer,
config,
timer: Delay::new(Duration::new(0, 0)),
interval: Delay::new(Duration::new(0, 0)),
pending_errors: VecDeque::with_capacity(2),
failures: 0,
outbound: None,
@ -220,8 +196,12 @@ impl Handler {
return;
}
// Note: This timeout only covers protocol negotiation.
StreamUpgradeError::Timeout => Failure::Timeout,
e => Failure::Other { error: Box::new(e) },
StreamUpgradeError::Timeout => {
debug_assert!(false, "ReadyUpgrade cannot time out");
return;
}
StreamUpgradeError::Apply(e) => void::unreachable(e),
StreamUpgradeError::Io(e) => Failure::Other { error: Box::new(e) },
};
self.pending_errors.push_front(error);
@ -230,8 +210,8 @@ impl Handler {
impl ConnectionHandler for Handler {
type FromBehaviour = Void;
type ToBehaviour = crate::Result;
type Error = Failure;
type ToBehaviour = Result<Duration, Failure>;
type Error = Void;
type InboundProtocol = ReadyUpgrade<StreamProtocol>;
type OutboundProtocol = ReadyUpgrade<StreamProtocol>;
type OutboundOpenInfo = ();
@ -250,8 +230,14 @@ impl ConnectionHandler for Handler {
fn poll(
&mut self,
cx: &mut Context<'_>,
) -> Poll<ConnectionHandlerEvent<ReadyUpgrade<StreamProtocol>, (), crate::Result, Self::Error>>
{
) -> Poll<
ConnectionHandlerEvent<
ReadyUpgrade<StreamProtocol>,
(),
Result<Duration, Failure>,
Self::Error,
>,
> {
match self.state {
State::Inactive { reported: true } => {
return Poll::Pending; // nothing to do on this connection
@ -274,9 +260,10 @@ impl ConnectionHandler for Handler {
self.inbound = None;
}
Poll::Ready(Ok(stream)) => {
log::trace!("answered inbound ping from {}", self.peer);
// A ping from a remote peer has been answered, wait for the next.
self.inbound = Some(protocol::recv_ping(stream).boxed());
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(Ok(Success::Pong)));
}
}
}
@ -288,19 +275,12 @@ impl ConnectionHandler for Handler {
self.failures += 1;
// Note: For backward-compatibility, with configured
// `max_failures == 1`, the first failure is always "free"
// and silent. This allows peers who still use a new substream
// Note: For backward-compatibility the first failure is always "free"
// and silent. This allows peers who use a new substream
// for each ping to have successful ping exchanges with peers
// that use a single substream, since every successful ping
// resets `failures` to `0`, while at the same time emitting
// events only for `max_failures - 1` failures, as before.
if self.failures > 1 || self.config.max_failures.get() > 1 {
if self.failures >= self.config.max_failures.get() {
log::debug!("Too many failures ({}). Closing connection.", self.failures);
return Poll::Ready(ConnectionHandlerEvent::Close(error));
}
// resets `failures` to `0`.
if self.failures > 1 {
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(Err(error)));
}
}
@ -309,35 +289,30 @@ impl ConnectionHandler for Handler {
match self.outbound.take() {
Some(OutboundState::Ping(mut ping)) => match ping.poll_unpin(cx) {
Poll::Pending => {
if self.timer.poll_unpin(cx).is_ready() {
self.pending_errors.push_front(Failure::Timeout);
} else {
self.outbound = Some(OutboundState::Ping(ping));
break;
}
self.outbound = Some(OutboundState::Ping(ping));
break;
}
Poll::Ready(Ok((stream, rtt))) => {
log::debug!("latency to {} is {}ms", self.peer, rtt.as_millis());
self.failures = 0;
self.timer.reset(self.config.interval);
self.interval.reset(self.config.interval);
self.outbound = Some(OutboundState::Idle(stream));
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(Ok(
Success::Ping { rtt },
)));
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(Ok(rtt)));
}
Poll::Ready(Err(e)) => {
self.pending_errors
.push_front(Failure::Other { error: Box::new(e) });
self.pending_errors.push_front(e);
}
},
Some(OutboundState::Idle(stream)) => match self.timer.poll_unpin(cx) {
Some(OutboundState::Idle(stream)) => match self.interval.poll_unpin(cx) {
Poll::Pending => {
self.outbound = Some(OutboundState::Idle(stream));
break;
}
Poll::Ready(()) => {
self.timer.reset(self.config.timeout);
self.outbound =
Some(OutboundState::Ping(protocol::send_ping(stream).boxed()));
self.outbound = Some(OutboundState::Ping(
send_ping(stream, self.config.timeout).boxed(),
));
}
},
Some(OutboundState::OpenStream) => {
@ -346,8 +321,7 @@ impl ConnectionHandler for Handler {
}
None => {
self.outbound = Some(OutboundState::OpenStream);
let protocol = SubstreamProtocol::new(ReadyUpgrade::new(PROTOCOL_NAME), ())
.with_timeout(self.config.timeout);
let protocol = SubstreamProtocol::new(ReadyUpgrade::new(PROTOCOL_NAME), ());
return Poll::Ready(ConnectionHandlerEvent::OutboundSubstreamRequest {
protocol,
});
@ -378,8 +352,9 @@ impl ConnectionHandler for Handler {
protocol: stream,
..
}) => {
self.timer.reset(self.config.timeout);
self.outbound = Some(OutboundState::Ping(protocol::send_ping(stream).boxed()));
self.outbound = Some(OutboundState::Ping(
send_ping(stream, self.config.timeout).boxed(),
));
}
ConnectionEvent::DialUpgradeError(dial_upgrade_error) => {
self.on_dial_upgrade_error(dial_upgrade_error)
@ -392,7 +367,7 @@ impl ConnectionHandler for Handler {
}
}
type PingFuture = BoxFuture<'static, Result<(Stream, Duration), io::Error>>;
type PingFuture = BoxFuture<'static, Result<(Stream, Duration), Failure>>;
type PongFuture = BoxFuture<'static, Result<Stream, io::Error>>;
/// The current state w.r.t. outbound pings.
@ -404,3 +379,15 @@ enum OutboundState {
/// A ping is being sent and the response awaited.
Ping(PingFuture),
}
/// Runs [`protocol::send_ping`] on `stream`, racing it against a timer of length `timeout`.
///
/// Resolves to the stream and the measured round-trip time if the ping completes first,
/// to [`Failure::Other`] if the ping itself fails, and to [`Failure::Timeout`] if the
/// timer fires first.
async fn send_ping(stream: Stream, timeout: Duration) -> Result<(Stream, Duration), Failure> {
    let exchange = protocol::send_ping(stream);
    futures::pin_mut!(exchange);
    let deadline = Delay::new(timeout);

    match future::select(exchange, deadline).await {
        // The ping finished before the deadline; only its error needs converting.
        Either::Left((outcome, _deadline)) => outcome.map_err(Failure::other),
        // The deadline elapsed while the ping was still in flight.
        Either::Right(((), _exchange)) => Err(Failure::Timeout),
    }
}

View File

@ -26,13 +26,21 @@
//!
//! # Usage
//!
//! The [`Behaviour`] struct implements the [`NetworkBehaviour`] trait. When used with a [`Swarm`],
//! it will respond to inbound ping requests and as necessary periodically send outbound
//! ping requests on every established connection. If a configurable number of consecutive
//! pings fail, the connection will be closed.
//! The [`Behaviour`] struct implements the [`NetworkBehaviour`] trait.
//! It will respond to inbound ping requests and periodically send outbound ping requests on every established connection.
//!
//! The [`Behaviour`] network behaviour produces [`Event`]s, which may be consumed from the [`Swarm`]
//! by an application, e.g. to collect statistics.
//! It is up to the user to implement a health-check / connection management policy based on the ping protocol.
//!
//! For example:
//!
//! - Disconnect from peers with an RTT > 200ms
//! - Disconnect from peers which don't support the ping protocol
//! - Disconnect from peers upon the first ping failure
//!
//! Users should inspect emitted [`Event`]s and call APIs on [`Swarm`]:
//!
//! - [`Swarm::close_connection`](libp2p_swarm::Swarm::close_connection) to close a specific connection
//! - [`Swarm::disconnect_peer_id`](libp2p_swarm::Swarm::disconnect_peer_id) to close all connections to a peer
//!
//! [`Swarm`]: libp2p_swarm::Swarm
//! [`Transport`]: libp2p_core::Transport
@ -43,22 +51,20 @@ mod handler;
mod protocol;
use handler::Handler;
pub use handler::{Config, Failure, Success};
use libp2p_core::{Endpoint, Multiaddr};
use libp2p_identity::PeerId;
use libp2p_swarm::{
behaviour::FromSwarm, ConnectionDenied, ConnectionId, NetworkBehaviour, PollParameters,
THandler, THandlerInEvent, THandlerOutEvent, ToSwarm,
};
use std::time::Duration;
use std::{
collections::VecDeque,
task::{Context, Poll},
};
pub use self::protocol::PROTOCOL_NAME;
/// The result of an inbound or outbound ping.
pub type Result = std::result::Result<Success, Failure>;
pub use handler::{Config, Failure};
/// A [`NetworkBehaviour`] that responds to inbound pings and
/// periodically sends outbound pings on every established connection.
@ -76,8 +82,10 @@ pub struct Behaviour {
pub struct Event {
/// The peer ID of the remote.
pub peer: PeerId,
/// The connection the ping was executed on.
pub connection: ConnectionId,
/// The result of an outbound ping: the measured round-trip time on success.
pub result: Result,
pub result: Result<Duration, Failure>,
}
impl Behaviour {
@ -103,30 +111,34 @@ impl NetworkBehaviour for Behaviour {
fn handle_established_inbound_connection(
&mut self,
_: ConnectionId,
_: PeerId,
peer: PeerId,
_: &Multiaddr,
_: &Multiaddr,
) -> std::result::Result<THandler<Self>, ConnectionDenied> {
Ok(Handler::new(self.config.clone()))
) -> Result<THandler<Self>, ConnectionDenied> {
Ok(Handler::new(self.config.clone(), peer))
}
fn handle_established_outbound_connection(
&mut self,
_: ConnectionId,
_: PeerId,
peer: PeerId,
_: &Multiaddr,
_: Endpoint,
) -> std::result::Result<THandler<Self>, ConnectionDenied> {
Ok(Handler::new(self.config.clone()))
) -> Result<THandler<Self>, ConnectionDenied> {
Ok(Handler::new(self.config.clone(), peer))
}
fn on_connection_handler_event(
&mut self,
peer: PeerId,
_: ConnectionId,
connection: ConnectionId,
result: THandlerOutEvent<Self>,
) {
self.events.push_front(Event { peer, result })
self.events.push_front(Event {
peer,
connection,
result,
})
}
fn poll(
@ -135,24 +147,13 @@ impl NetworkBehaviour for Behaviour {
_: &mut impl PollParameters,
) -> Poll<ToSwarm<Self::ToSwarm, THandlerInEvent<Self>>> {
if let Some(e) = self.events.pop_back() {
let Event { result, peer } = &e;
match result {
Ok(Success::Ping { .. }) => log::debug!("Ping sent to {:?}", peer),
Ok(Success::Pong) => log::debug!("Ping received from {:?}", peer),
_ => {}
}
Poll::Ready(ToSwarm::GenerateEvent(e))
} else {
Poll::Pending
}
}
fn on_swarm_event(
&mut self,
event: libp2p_swarm::behaviour::FromSwarm<Self::ConnectionHandler>,
) {
fn on_swarm_event(&mut self, event: FromSwarm<Self::ConnectionHandler>) {
match event {
FromSwarm::ConnectionEstablished(_)
| FromSwarm::ConnectionClosed(_)

View File

@ -20,7 +20,6 @@
//! Integration tests for the `Ping` network behaviour.
use futures::prelude::*;
use libp2p_ping as ping;
use libp2p_swarm::keep_alive;
use libp2p_swarm::{NetworkBehaviour, Swarm, SwarmEvent};
@ -59,63 +58,9 @@ fn ping_pong() {
}
fn assert_ping_rtt_less_than_50ms(e: ping::Event) {
let success = e.result.expect("a ping success");
let rtt = e.result.expect("a ping success");
if let ping::Success::Ping { rtt } = success {
assert!(rtt < Duration::from_millis(50))
}
}
/// Tests that the connection is closed upon a configurable
/// number of consecutive ping failures.
#[test]
fn max_failures() {
fn prop(max_failures: NonZeroU8) {
let cfg = ping::Config::new()
.with_interval(Duration::from_millis(10))
.with_timeout(Duration::from_millis(0))
.with_max_failures(max_failures.into());
let mut swarm1 = Swarm::new_ephemeral(|_| Behaviour::new(cfg.clone()));
let mut swarm2 = Swarm::new_ephemeral(|_| Behaviour::new(cfg.clone()));
let (count1, count2) = async_std::task::block_on(async {
swarm1.listen().await;
swarm2.connect(&mut swarm1).await;
future::join(
count_ping_failures_until_connection_closed(swarm1),
count_ping_failures_until_connection_closed(swarm2),
)
.await
});
assert_eq!(u8::max(count1, count2), max_failures.get() - 1);
}
QuickCheck::new().tests(10).quickcheck(prop as fn(_))
}
async fn count_ping_failures_until_connection_closed(mut swarm: Swarm<Behaviour>) -> u8 {
let mut failure_count = 0;
loop {
match swarm.next_swarm_event().await {
SwarmEvent::Behaviour(BehaviourEvent::Ping(ping::Event {
result: Ok(ping::Success::Ping { .. }),
..
})) => {
failure_count = 0; // there may be an occasional success
}
SwarmEvent::Behaviour(BehaviourEvent::Ping(ping::Event { result: Err(_), .. })) => {
failure_count += 1;
}
SwarmEvent::ConnectionClosed { .. } => {
return failure_count;
}
_ => {}
}
}
assert!(rtt < Duration::from_millis(50))
}
#[test]

View File

@ -699,6 +699,24 @@ where
}
}
/// Attempt to gracefully close the connection identified by `connection_id`.
///
/// This only initiates the close and returns immediately; closing is asynchronous.
/// A [`SwarmEvent::ConnectionClosed`] event will be emitted once the connection is actually closed.
///
/// # Returns
///
/// - `true` if the connection was established and is now being closed.
/// - `false` if the connection was not found or is no longer established.
pub fn close_connection(&mut self, connection_id: ConnectionId) -> bool {
    match self.pool.get_established(connection_id) {
        Some(established) => {
            established.start_close();
            true
        }
        None => false,
    }
}
/// Checks whether there is an established connection to a peer.
pub fn is_connected(&self, peer_id: &PeerId) -> bool {
self.pool.is_connected(*peer_id)