Skip to content

Commit c650ea8

Browse files
ramfox“ramfox”dignifiedquire
authored
fix(iroh-relay): removes deadlock in Clients (#3099)
## Description Remove the deadlock that was a bit hidden due to `DashMap`. Added `client` parameter to `unregister` to explicitly drop before attempting to call into the `clients` `DashMap` again. ## Notes & open questions The `warn` log for stream termination seems a little fear-mongering, but I'm not sure the best way to "downgrade" this, as we seem to rely on this error in tests like `endpoint_relay_connect_loop`. Ended up leaving it as is. ## Change checklist - [x] Self-review. - [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant. - [x] Tests if relevant. --------- Co-authored-by: “ramfox” <“[email protected]”> Co-authored-by: Friedel Ziegelmayer <[email protected]>
1 parent 60ba9ac commit c650ea8

File tree

1 file changed

+56
-35
lines changed

1 file changed

+56
-35
lines changed

iroh-relay/src/server/clients.rs

Lines changed: 56 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use iroh_metrics::inc;
1111
use tokio::sync::mpsc::error::TrySendError;
1212
use tracing::{debug, trace};
1313

14-
use super::client::{Client, Config, Packet};
14+
use super::client::{Client, Config};
1515
use crate::server::metrics::Metrics;
1616

1717
/// Manages the connections to all currently connected clients.
@@ -56,7 +56,14 @@ impl Clients {
5656
/// Removes the client from the map of clients, & sends a notification
5757
/// to each client that peers has sent data to, to let them know that
5858
/// peer is gone from the network.
59-
async fn unregister(&self, node_id: NodeId) {
59+
///
60+
/// Explicitly drops the reference to the client to avoid deadlock.
61+
async fn unregister<'a>(
62+
&self,
63+
client: dashmap::mapref::one::Ref<'a, iroh_base::PublicKey, Client>,
64+
node_id: NodeId,
65+
) {
66+
drop(client); // avoid deadlock
6067
trace!(node_id = node_id.fmt_short(), "unregistering client");
6168

6269
if let Some((_, client)) = self.0.clients.remove(&node_id) {
@@ -83,42 +90,53 @@ impl Clients {
8390
}
8491
}
8592

86-
/// Attempt to send a packet to client with [`NodeId`] `dst`
93+
/// Attempt to send a packet to client with [`NodeId`] `dst`.
8794
pub(super) async fn send_packet(&self, dst: NodeId, data: Bytes, src: NodeId) -> Result<()> {
88-
if let Some(client) = self.0.clients.get(&dst) {
89-
let res = client.try_send_packet(src, data);
90-
return self.process_result(src, dst, res).await;
95+
let Some(client) = self.0.clients.get(&dst) else {
96+
debug!(dst = dst.fmt_short(), "no connected client, dropped packet");
97+
inc!(Metrics, send_packets_dropped);
98+
return Ok(());
99+
};
100+
match client.try_send_packet(src, data) {
101+
Ok(_) => {
102+
// Record sent_to relationship
103+
self.0.sent_to.entry(src).or_default().insert(dst);
104+
Ok(())
105+
}
106+
Err(TrySendError::Full(_)) => {
107+
debug!(
108+
dst = dst.fmt_short(),
109+
"client too busy to receive packet, dropping packet"
110+
);
111+
bail!("failed to send message: full");
112+
}
113+
Err(TrySendError::Closed(_)) => {
114+
debug!(
115+
dst = dst.fmt_short(),
116+
"can no longer write to client, dropping message and pruning connection"
117+
);
118+
self.unregister(client, dst).await;
119+
bail!("failed to send message: gone");
120+
}
91121
}
92-
debug!(dst = dst.fmt_short(), "no connected client, dropped packet");
93-
inc!(Metrics, send_packets_dropped);
94-
Ok(())
95122
}
96123

124+
/// Attempt to send a disco packet to client with [`NodeId`] `dst`.
97125
pub(super) async fn send_disco_packet(
98126
&self,
99127
dst: NodeId,
100128
data: Bytes,
101129
src: NodeId,
102130
) -> Result<()> {
103-
if let Some(client) = self.0.clients.get(&dst) {
104-
let res = client.try_send_disco_packet(src, data);
105-
return self.process_result(src, dst, res).await;
106-
}
107-
debug!(
108-
dst = dst.fmt_short(),
109-
"no connected client, dropped disco packet"
110-
);
111-
inc!(Metrics, disco_packets_dropped);
112-
Ok(())
113-
}
114-
115-
async fn process_result(
116-
&self,
117-
src: NodeId,
118-
dst: NodeId,
119-
res: Result<(), TrySendError<Packet>>,
120-
) -> Result<()> {
121-
match res {
131+
let Some(client) = self.0.clients.get(&dst) else {
132+
debug!(
133+
dst = dst.fmt_short(),
134+
"no connected client, dropped disco packet"
135+
);
136+
inc!(Metrics, disco_packets_dropped);
137+
return Ok(());
138+
};
139+
match client.try_send_disco_packet(src, data) {
122140
Ok(_) => {
123141
// Record sent_to relationship
124142
self.0.sent_to.entry(src).or_default().insert(dst);
@@ -127,17 +145,17 @@ impl Clients {
127145
Err(TrySendError::Full(_)) => {
128146
debug!(
129147
dst = dst.fmt_short(),
130-
"client too busy to receive packet, dropping packet"
148+
"client too busy to receive disco packet, dropping packet"
131149
);
132-
bail!("failed to send message");
150+
bail!("failed to send message: full");
133151
}
134152
Err(TrySendError::Closed(_)) => {
135153
debug!(
136154
dst = dst.fmt_short(),
137-
"can no longer write to client, dropping message and pruning connection"
155+
"can no longer write to client, dropping disco message and pruning connection"
138156
);
139-
self.unregister(dst).await;
140-
bail!("failed to send message");
157+
self.unregister(client, dst).await;
158+
bail!("failed to send message: gone");
141159
}
142160
}
143161
}
@@ -212,8 +230,11 @@ mod tests {
212230
}
213231
);
214232

215-
// send peer_gone
216-
clients.unregister(a_key).await;
233+
let client = clients.0.clients.get(&a_key).unwrap();
234+
235+
// send peer_gone. Also, tests that we do not get a deadlock
236+
// when unregistering.
237+
clients.unregister(client, a_key).await;
217238

218239
assert!(!clients.0.clients.contains_key(&a_key));
219240
clients.shutdown().await;

0 commit comments

Comments
 (0)