Skip to content

Commit b76500d

Browse files
authored
feat: Implement RelayDatagramsQueue (#2998)
Based on #2986

## Description

<!-- A summary of what this pull request achieves and a rough list of changes. -->

Replaces `RelayRecvReceiver` and `RelayRecvSender` with the (clonable & shared) `RelayDatagramsQueue`. This queue contains a `ConcurrentQueue` (from the smol library `concurrent-queue`) and an `AtomicWaker`.

It should only be polled from one task. If polled from multiple tasks, the tasks will overwrite each other's wakers.

Unfortunately we can't make it use `&mut Self` in `poll_recv` because `quinn` expects the `AsyncUdpSocket`'s `poll_recv` interface to be `&self`.

This (un)fortunately doesn't have an effect on performance for me. (The benchmark is completely broken half the time for some reason, but when it runs it produces normal numbers:)

```sh
$ DEV_RELAY_ONLY=true cargo run -p iroh-net-bench --release --features=local-relay -- iroh --with-relay --download-size=100M
      │  Throughput   │ Duration
──────┼───────────────┼──────────
 AVG  │   55.05 MiB/s │    1.82s
  P0  │   55.03 MiB/s │    1.82s
 P10  │   55.06 MiB/s │    1.82s
 P50  │   55.06 MiB/s │    1.82s
 P90  │   55.06 MiB/s │    1.82s
 P100 │   55.06 MiB/s │    1.82s
```

And basically exactly the same times for the PR this is based on.

## Breaking Changes

<!-- Optional, if there are any breaking changes document them, including how to migrate older code. -->

## Notes & open questions

<!-- Any notes, remarks or open questions you have to make about the PR. -->

## Todo

- [x] Rename variables to e.g. `relay_datagrams_queue` instead of `relay_recv_sender` or `relay_recv_channel` etc.
- [x] Add documentation about multiple tasks polling, etc.

## Change checklist

- [x] Self-review.
- [x] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.
- [x] Tests if relevant.
- [x] All breaking changes documented.
1 parent e575af2 commit b76500d

File tree

4 files changed

+144
-68
lines changed

4 files changed

+144
-68
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

iroh/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ workspace = true
1717

1818
[dependencies]
1919
anyhow = { version = "1" }
20+
concurrent-queue = "2.5"
2021
axum = { version = "0.7", optional = true }
2122
backoff = "0.4.0"
2223
base64 = "0.22.1"

iroh/src/magicsock.rs

Lines changed: 123 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ use std::{
3131

3232
use anyhow::{anyhow, Context as _, Result};
3333
use bytes::Bytes;
34+
use concurrent_queue::ConcurrentQueue;
3435
use futures_lite::{FutureExt, Stream, StreamExt};
35-
use futures_util::stream::BoxStream;
36+
use futures_util::{stream::BoxStream, task::AtomicWaker};
3637
use iroh_base::key::NodeId;
3738
use iroh_metrics::{inc, inc_by};
3839
use iroh_relay::protos::stun;
@@ -180,12 +181,12 @@ pub(crate) struct MagicSock {
180181
me: String,
181182
/// Proxy
182183
proxy_url: Option<Url>,
183-
/// Channel to receive datagrams from relays for [`AsyncUdpSocket::poll_recv`].
184+
/// Queue to receive datagrams from relays for [`AsyncUdpSocket::poll_recv`].
184185
///
185-
/// QUIC datagrams received by relays are put on this channel and consumed by
186-
/// [`AsyncUdpSocket`]. This channel takes care of the wakers needed by
186+
/// Datagrams received from relays are put into this queue and consumed by
187+
/// [`AsyncUdpSocket`]. This queue takes care of the wakers needed by
187188
/// [`AsyncUdpSocket::poll_recv`].
188-
relay_recv_channel: RelayRecvReceiver,
189+
relay_datagrams_queue: Arc<RelayDatagramsQueue>,
189190

190191
network_send_wakers: Arc<parking_lot::Mutex<Option<Waker>>>,
191192
/// Counter for ordering of [`MagicSock::poll_recv`] polling order.
@@ -860,7 +861,7 @@ impl MagicSock {
860861
// For each output buffer keep polling the datagrams from the relay until one is
861862
// a QUIC datagram to be placed into the output buffer. Or the queue is empty.
862863
loop {
863-
let recv = match self.relay_recv_channel.poll_recv(cx) {
864+
let recv = match self.relay_datagrams_queue.poll_recv(cx) {
864865
Poll::Ready(Ok(recv)) => recv,
865866
Poll::Ready(Err(err)) => {
866867
error!("relay_recv_channel closed: {err:#}");
@@ -1510,7 +1511,7 @@ impl Handle {
15101511
insecure_skip_relay_cert_verify,
15111512
} = opts;
15121513

1513-
let (relay_recv_tx, relay_recv_rx) = relay_recv_channel();
1514+
let relay_datagrams_queue = Arc::new(RelayDatagramsQueue::new());
15141515

15151516
let (pconn4, pconn6) = bind(addr_v4, addr_v6)?;
15161517
let port = pconn4.port();
@@ -1547,7 +1548,7 @@ impl Handle {
15471548
local_addrs: std::sync::RwLock::new((ipv4_addr, ipv6_addr)),
15481549
closing: AtomicBool::new(false),
15491550
closed: AtomicBool::new(false),
1550-
relay_recv_channel: relay_recv_rx,
1551+
relay_datagrams_queue: relay_datagrams_queue.clone(),
15511552
network_send_wakers: Arc::new(parking_lot::Mutex::new(None)),
15521553
poll_recv_counter: AtomicUsize::new(0),
15531554
actor_sender: actor_sender.clone(),
@@ -1572,7 +1573,7 @@ impl Handle {
15721573

15731574
let mut actor_tasks = JoinSet::default();
15741575

1575-
let relay_actor = RelayActor::new(inner.clone(), relay_recv_tx);
1576+
let relay_actor = RelayActor::new(inner.clone(), relay_datagrams_queue);
15761577
let relay_actor_cancel_token = relay_actor.cancel_token();
15771578
actor_tasks.spawn(
15781579
async move {
@@ -1712,64 +1713,74 @@ enum DiscoBoxError {
17121713
Parse(anyhow::Error),
17131714
}
17141715

1715-
/// Channel for [`MagicSock::poll_recv_relay`] to receive datagrams from relays.
1716+
/// A queue holding [`RelayRecvDatagram`]s that can be polled in async
1717+
/// contexts, and wakes up tasks when something adds items using [`try_send`].
17161718
///
1717-
/// The sender and receiver will take care of the required wakers needed for
1718-
/// [`AsyncUdpSocket::poll_recv`].
1719-
// TODO: This channel should possibly be implemented with concurrent-queue and atomic-waker.
1720-
// Or maybe async-channel.
1721-
fn relay_recv_channel() -> (RelayRecvSender, RelayRecvReceiver) {
1722-
let (tx, rx) = mpsc::channel(128);
1723-
let waker = Arc::new(parking_lot::Mutex::new(None));
1724-
let sender = RelayRecvSender {
1725-
sender: tx,
1726-
waker: waker.clone(),
1727-
};
1728-
let receiver = RelayRecvReceiver {
1729-
receiver: parking_lot::Mutex::new(rx),
1730-
waker,
1731-
};
1732-
(sender, receiver)
1719+
/// This is used to transfer relay datagrams between the [`RelayActor`]
1720+
/// and [`MagicSock`].
1721+
///
1722+
/// [`try_send`]: Self::try_send
1723+
/// [`RelayActor`]: crate::magicsock::RelayActor
1724+
/// [`MagicSock`]: crate::magicsock::MagicSock
1725+
#[derive(Debug)]
1726+
struct RelayDatagramsQueue {
1727+
queue: ConcurrentQueue<RelayRecvDatagram>,
1728+
waker: AtomicWaker,
17331729
}
17341730

1735-
#[derive(Debug, Clone)]
1736-
struct RelayRecvSender {
1737-
sender: mpsc::Sender<RelayRecvDatagram>,
1738-
waker: Arc<parking_lot::Mutex<Option<Waker>>>,
1739-
}
1731+
impl RelayDatagramsQueue {
1732+
/// Creates a new, empty queue with a fixed size bound of 128 items.
1733+
fn new() -> Self {
1734+
Self {
1735+
queue: ConcurrentQueue::bounded(128),
1736+
waker: AtomicWaker::new(),
1737+
}
1738+
}
17401739

1741-
impl RelayRecvSender {
1740+
/// Sends an item into this queue and wakes a potential task
1741+
/// that's registered its waker with a [`poll_recv`] call.
1742+
///
1743+
/// [`poll_recv`]: Self::poll_recv
17421744
fn try_send(
17431745
&self,
17441746
item: RelayRecvDatagram,
1745-
) -> Result<(), mpsc::error::TrySendError<RelayRecvDatagram>> {
1746-
self.sender.try_send(item).inspect(|_| {
1747-
if let Some(waker) = self.waker.lock().take() {
1748-
waker.wake();
1749-
}
1747+
) -> Result<(), concurrent_queue::PushError<RelayRecvDatagram>> {
1748+
self.queue.push(item).inspect(|_| {
1749+
self.waker.wake();
17501750
})
17511751
}
1752-
}
17531752

1754-
#[derive(Debug)]
1755-
struct RelayRecvReceiver {
1756-
receiver: parking_lot::Mutex<mpsc::Receiver<RelayRecvDatagram>>,
1757-
waker: Arc<parking_lot::Mutex<Option<Waker>>>,
1758-
}
1759-
1760-
impl RelayRecvReceiver {
1753+
/// Polls for new items in the queue.
1754+
///
1755+
/// Although this method is available from `&self`, it must not be
1756+
/// polled concurrently between tasks.
1757+
///
1758+
/// Calling this will replace the current waker used. So if another task
1759+
/// waits for this, that task's waker will be replaced and it won't be
1760+
/// woken up for new items.
1761+
///
1762+
/// The reason this method is made available as `&self` is because
1763+
/// the interface for quinn's [`AsyncUdpSocket::poll_recv`] requires us
1764+
/// to be able to poll from `&self`.
17611765
fn poll_recv(&self, cx: &mut Context) -> Poll<Result<RelayRecvDatagram>> {
1762-
let mut receiver = self.receiver.lock();
1763-
self.waker.lock().replace(cx.waker().clone());
1764-
match receiver.try_recv() {
1765-
Ok(item) => {
1766-
self.waker.lock().take();
1767-
Poll::Ready(Ok(item))
1768-
}
1769-
Err(mpsc::error::TryRecvError::Empty) => Poll::Pending,
1770-
Err(mpsc::error::TryRecvError::Disconnected) => {
1771-
Poll::Ready(Err(anyhow!("All RelayRecvSenders disconnected")))
1766+
match self.queue.pop() {
1767+
Ok(value) => Poll::Ready(Ok(value)),
1768+
Err(concurrent_queue::PopError::Empty) => {
1769+
self.waker.register(cx.waker());
1770+
1771+
match self.queue.pop() {
1772+
Ok(value) => {
1773+
self.waker.take();
1774+
Poll::Ready(Ok(value))
1775+
}
1776+
Err(concurrent_queue::PopError::Empty) => Poll::Pending,
1777+
Err(concurrent_queue::PopError::Closed) => {
1778+
self.waker.take();
1779+
Poll::Ready(Err(anyhow!("Queue closed")))
1780+
}
1781+
}
17721782
}
1783+
Err(concurrent_queue::PopError::Closed) => Poll::Ready(Err(anyhow!("Queue closed"))),
17731784
}
17741785
}
17751786
}
@@ -2857,7 +2868,10 @@ mod tests {
28572868
use tokio_util::task::AbortOnDropHandle;
28582869

28592870
use super::*;
2860-
use crate::{defaults::staging::EU_RELAY_HOSTNAME, tls, Endpoint, RelayMode};
2871+
use crate::{
2872+
defaults::staging::{self, EU_RELAY_HOSTNAME},
2873+
tls, Endpoint, RelayMode,
2874+
};
28612875

28622876
const ALPN: &[u8] = b"n0/test/1";
28632877

@@ -4020,4 +4034,57 @@ mod tests {
40204034
// TODO: could remove the addresses again, send, add it back and see it recover.
40214035
// But we don't have that much private access to the NodeMap. This will do for now.
40224036
}
4037+
4038+
#[tokio::test(flavor = "multi_thread")]
4039+
async fn test_relay_datagram_queue() {
4040+
let queue = Arc::new(RelayDatagramsQueue::new());
4041+
let url = staging::default_na_relay_node().url;
4042+
let capacity = queue.queue.capacity().unwrap();
4043+
4044+
let mut tasks = JoinSet::new();
4045+
4046+
tasks.spawn({
4047+
let queue = queue.clone();
4048+
async move {
4049+
let mut expected_msgs = vec![false; capacity];
4050+
4051+
while let Ok(datagram) = tokio::time::timeout(
4052+
Duration::from_millis(100),
4053+
futures_lite::future::poll_fn(|cx| {
4054+
queue.poll_recv(cx).map(|result| result.unwrap())
4055+
}),
4056+
)
4057+
.await
4058+
{
4059+
let msg_num = usize::from_le_bytes(datagram.buf.as_ref().try_into().unwrap());
4060+
4061+
if expected_msgs[msg_num] {
4062+
panic!("Received message number {msg_num} more than once (duplicated)");
4063+
}
4064+
4065+
expected_msgs[msg_num] = true;
4066+
}
4067+
4068+
assert!(expected_msgs.into_iter().all(|is_set| is_set));
4069+
}
4070+
});
4071+
4072+
for i in 0..capacity {
4073+
tasks.spawn({
4074+
let queue = queue.clone();
4075+
let url = url.clone();
4076+
async move {
4077+
queue
4078+
.try_send(RelayRecvDatagram {
4079+
url,
4080+
src: PublicKey::from_bytes(&[0u8; 32]).unwrap(),
4081+
buf: Bytes::copy_from_slice(&i.to_le_bytes()),
4082+
})
4083+
.unwrap();
4084+
}
4085+
});
4086+
}
4087+
4088+
tasks.join_all().await;
4089+
}
40234090
}

iroh/src/magicsock/relay_actor.rs

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument};
2525

2626
use crate::{
2727
key::{NodeId, PUBLIC_KEY_LENGTH},
28-
magicsock::{MagicSock, Metrics as MagicsockMetrics, RelayContents, RelayRecvSender},
28+
magicsock::{MagicSock, Metrics as MagicsockMetrics, RelayContents, RelayDatagramsQueue},
2929
};
3030

3131
/// How long a non-home relay connection needs to be idle (last written to) before we close it.
@@ -52,8 +52,8 @@ struct ConnectedRelayActor {
5252
/// The time of the last request for its write
5353
/// channel (currently even if there was no write).
5454
last_write: Instant,
55-
/// Channel to send received QUIC datagrams on.
56-
relay_recv_channel: RelayRecvSender,
55+
/// Queue to send received relay datagrams on.
56+
relay_datagrams_queue: Arc<RelayDatagramsQueue>,
5757
url: RelayUrl,
5858
relay_client: relay::client::Client,
5959
relay_client_receiver: relay::client::ClientReceiver,
@@ -84,11 +84,11 @@ impl ConnectedRelayActor {
8484
url: RelayUrl,
8585
relay_client: relay::client::Client,
8686
relay_client_receiver: relay::client::ClientReceiver,
87-
relay_recv_channel: RelayRecvSender,
87+
relay_datagrams_queue: Arc<RelayDatagramsQueue>,
8888
) -> Self {
8989
ConnectedRelayActor {
9090
last_write: Instant::now(),
91-
relay_recv_channel,
91+
relay_datagrams_queue,
9292
url,
9393
node_present: BTreeSet::new(),
9494
backoff: backoff::exponential::ExponentialBackoffBuilder::new()
@@ -246,7 +246,7 @@ impl ConnectedRelayActor {
246246
src: remote_node_id,
247247
buf: datagram,
248248
};
249-
if let Err(err) = self.relay_recv_channel.try_send(res) {
249+
if let Err(err) = self.relay_datagrams_queue.try_send(res) {
250250
warn!("dropping received relay packet: {err:#}");
251251
}
252252
}
@@ -282,19 +282,22 @@ impl ConnectedRelayActor {
282282

283283
pub(super) struct RelayActor {
284284
msock: Arc<MagicSock>,
285-
relay_recv_channel: RelayRecvSender,
285+
relay_datagrams_queue: Arc<RelayDatagramsQueue>,
286286
/// relay Url -> connection to the node
287287
connected_relays: BTreeMap<RelayUrl, (mpsc::Sender<ConnectedRelayMessage>, JoinHandle<()>)>,
288288
ping_tasks: JoinSet<(RelayUrl, bool)>,
289289
cancel_token: CancellationToken,
290290
}
291291

292292
impl RelayActor {
293-
pub(super) fn new(msock: Arc<MagicSock>, recv_channel: RelayRecvSender) -> Self {
293+
pub(super) fn new(
294+
msock: Arc<MagicSock>,
295+
relay_datagrams_queue: Arc<RelayDatagramsQueue>,
296+
) -> Self {
294297
let cancel_token = CancellationToken::new();
295298
Self {
296299
msock,
297-
relay_recv_channel: recv_channel,
300+
relay_datagrams_queue,
298301
connected_relays: Default::default(),
299302
ping_tasks: Default::default(),
300303
cancel_token,
@@ -536,11 +539,15 @@ impl RelayActor {
536539
let handle = tokio::task::spawn({
537540
let url = url.clone();
538541
let relay_client = relay_client.clone();
539-
let relay_recv_channel = self.relay_recv_channel.clone();
542+
let relay_datagrams_queue = self.relay_datagrams_queue.clone();
540543
let span = info_span!("conn-relay-actor", %url);
541544
async move {
542-
let conn_actor =
543-
ConnectedRelayActor::new(url, relay_client, relay_receiver, relay_recv_channel);
545+
let conn_actor = ConnectedRelayActor::new(
546+
url,
547+
relay_client,
548+
relay_receiver,
549+
relay_datagrams_queue,
550+
);
544551

545552
if let Err(err) = conn_actor.run(conn_actor_inbox_rx).await {
546553
warn!("connection error: {:?}", err);

0 commit comments

Comments
 (0)