Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit d35bed8

Browse files
authored
Don't wake up destination transaction queues if they're not due for retry. (#16223)
1 parent dcb2778 commit d35bed8

File tree

15 files changed

+228
-90
lines changed

15 files changed

+228
-90
lines changed

changelog.d/16223.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve resource usage when sending data to a large number of remote hosts that are marked as "down".

synapse/federation/send_queue.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
from synapse.federation.sender import AbstractFederationSender, FederationSender
5050
from synapse.metrics import LaterGauge
5151
from synapse.replication.tcp.streams.federation import FederationStream
52-
from synapse.types import JsonDict, ReadReceipt, RoomStreamToken
52+
from synapse.types import JsonDict, ReadReceipt, RoomStreamToken, StrCollection
5353
from synapse.util.metrics import Measure
5454

5555
from .units import Edu
@@ -229,7 +229,7 @@ async def send_read_receipt(self, receipt: ReadReceipt) -> None:
229229
"""
230230
# nothing to do here: the replication listener will handle it.
231231

232-
def send_presence_to_destinations(
232+
async def send_presence_to_destinations(
233233
self, states: Iterable[UserPresenceState], destinations: Iterable[str]
234234
) -> None:
235235
"""As per FederationSender
@@ -245,7 +245,9 @@ def send_presence_to_destinations(
245245

246246
self.notifier.on_new_replication_data()
247247

248-
def send_device_messages(self, destination: str, immediate: bool = True) -> None:
248+
async def send_device_messages(
249+
self, destinations: StrCollection, immediate: bool = True
250+
) -> None:
249251
"""As per FederationSender"""
250252
# We don't need to replicate this as it gets sent down a different
251253
# stream.
@@ -463,7 +465,7 @@ class ParsedFederationStreamData:
463465
edus: Dict[str, List[Edu]]
464466

465467

466-
def process_rows_for_federation(
468+
async def process_rows_for_federation(
467469
transaction_queue: FederationSender,
468470
rows: List[FederationStream.FederationStreamRow],
469471
) -> None:
@@ -496,7 +498,7 @@ def process_rows_for_federation(
496498
parsed_row.add_to_buffer(buff)
497499

498500
for state, destinations in buff.presence_destinations:
499-
transaction_queue.send_presence_to_destinations(
501+
await transaction_queue.send_presence_to_destinations(
500502
states=[state], destinations=destinations
501503
)
502504

synapse/federation/sender/__init__.py

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,10 @@
147147
import synapse.metrics
148148
from synapse.api.presence import UserPresenceState
149149
from synapse.events import EventBase
150-
from synapse.federation.sender.per_destination_queue import PerDestinationQueue
150+
from synapse.federation.sender.per_destination_queue import (
151+
CATCHUP_RETRY_INTERVAL,
152+
PerDestinationQueue,
153+
)
151154
from synapse.federation.sender.transaction_manager import TransactionManager
152155
from synapse.federation.units import Edu
153156
from synapse.logging.context import make_deferred_yieldable, run_in_background
@@ -161,9 +164,10 @@
161164
run_as_background_process,
162165
wrap_as_background_process,
163166
)
164-
from synapse.types import JsonDict, ReadReceipt, RoomStreamToken
167+
from synapse.types import JsonDict, ReadReceipt, RoomStreamToken, StrCollection
165168
from synapse.util import Clock
166169
from synapse.util.metrics import Measure
170+
from synapse.util.retryutils import filter_destinations_by_retry_limiter
167171

168172
if TYPE_CHECKING:
169173
from synapse.events.presence_router import PresenceRouter
@@ -213,7 +217,7 @@ async def send_read_receipt(self, receipt: ReadReceipt) -> None:
213217
raise NotImplementedError()
214218

215219
@abc.abstractmethod
216-
def send_presence_to_destinations(
220+
async def send_presence_to_destinations(
217221
self, states: Iterable[UserPresenceState], destinations: Iterable[str]
218222
) -> None:
219223
"""Send the given presence states to the given destinations.
@@ -242,9 +246,11 @@ def build_and_send_edu(
242246
raise NotImplementedError()
243247

244248
@abc.abstractmethod
245-
def send_device_messages(self, destination: str, immediate: bool = True) -> None:
249+
async def send_device_messages(
250+
self, destinations: StrCollection, immediate: bool = True
251+
) -> None:
246252
"""Tells the sender that a new device message is ready to be sent to the
247-
destination. The `immediate` flag specifies whether the messages should
253+
destinations. The `immediate` flag specifies whether the messages should
248254
be tried to be sent immediately, or whether it can be delayed for a
249255
short while (to aid performance).
250256
"""
@@ -716,6 +722,13 @@ async def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None:
716722
pdu.internal_metadata.stream_ordering,
717723
)
718724

725+
destinations = await filter_destinations_by_retry_limiter(
726+
destinations,
727+
clock=self.clock,
728+
store=self.store,
729+
retry_due_within_ms=CATCHUP_RETRY_INTERVAL,
730+
)
731+
719732
for destination in destinations:
720733
self._get_per_destination_queue(destination).send_pdu(pdu)
721734

@@ -763,12 +776,20 @@ async def send_read_receipt(self, receipt: ReadReceipt) -> None:
763776
domains_set = await self._storage_controllers.state.get_current_hosts_in_room_or_partial_state_approximation(
764777
room_id
765778
)
766-
domains = [
779+
domains: StrCollection = [
767780
d
768781
for d in domains_set
769782
if not self.is_mine_server_name(d)
770783
and self._federation_shard_config.should_handle(self._instance_name, d)
771784
]
785+
786+
domains = await filter_destinations_by_retry_limiter(
787+
domains,
788+
clock=self.clock,
789+
store=self.store,
790+
retry_due_within_ms=CATCHUP_RETRY_INTERVAL,
791+
)
792+
772793
if not domains:
773794
return
774795

@@ -816,7 +837,7 @@ def _flush_rrs_for_room(self, room_id: str) -> None:
816837
for queue in queues:
817838
queue.flush_read_receipts_for_room(room_id)
818839

819-
def send_presence_to_destinations(
840+
async def send_presence_to_destinations(
820841
self, states: Iterable[UserPresenceState], destinations: Iterable[str]
821842
) -> None:
822843
"""Send the given presence states to the given destinations.
@@ -831,13 +852,20 @@ def send_presence_to_destinations(
831852
for state in states:
832853
assert self.is_mine_id(state.user_id)
833854

855+
destinations = await filter_destinations_by_retry_limiter(
856+
[
857+
d
858+
for d in destinations
859+
if self._federation_shard_config.should_handle(self._instance_name, d)
860+
],
861+
clock=self.clock,
862+
store=self.store,
863+
retry_due_within_ms=CATCHUP_RETRY_INTERVAL,
864+
)
865+
834866
for destination in destinations:
835867
if self.is_mine_server_name(destination):
836868
continue
837-
if not self._federation_shard_config.should_handle(
838-
self._instance_name, destination
839-
):
840-
continue
841869

842870
self._get_per_destination_queue(destination).send_presence(
843871
states, start_loop=False
@@ -896,21 +924,29 @@ def send_edu(self, edu: Edu, key: Optional[Hashable]) -> None:
896924
else:
897925
queue.send_edu(edu)
898926

899-
def send_device_messages(self, destination: str, immediate: bool = True) -> None:
900-
if self.is_mine_server_name(destination):
901-
logger.warning("Not sending device update to ourselves")
902-
return
903-
904-
if not self._federation_shard_config.should_handle(
905-
self._instance_name, destination
906-
):
907-
return
927+
async def send_device_messages(
928+
self, destinations: StrCollection, immediate: bool = True
929+
) -> None:
930+
destinations = await filter_destinations_by_retry_limiter(
931+
[
932+
destination
933+
for destination in destinations
934+
if self._federation_shard_config.should_handle(
935+
self._instance_name, destination
936+
)
937+
and not self.is_mine_server_name(destination)
938+
],
939+
clock=self.clock,
940+
store=self.store,
941+
retry_due_within_ms=CATCHUP_RETRY_INTERVAL,
942+
)
908943

909-
if immediate:
910-
self._get_per_destination_queue(destination).attempt_new_transaction()
911-
else:
912-
self._get_per_destination_queue(destination).mark_new_data()
913-
self._destination_wakeup_queue.add_to_queue(destination)
944+
for destination in destinations:
945+
if immediate:
946+
self._get_per_destination_queue(destination).attempt_new_transaction()
947+
else:
948+
self._get_per_destination_queue(destination).mark_new_data()
949+
self._destination_wakeup_queue.add_to_queue(destination)
914950

915951
def wake_destination(self, destination: str) -> None:
916952
"""Called when we want to retry sending transactions to a remote.

synapse/federation/sender/per_destination_queue.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@
5959
)
6060

6161

62+
# If the retry interval is larger than this then we enter "catchup" mode
63+
CATCHUP_RETRY_INTERVAL = 60 * 60 * 1000
64+
65+
6266
class PerDestinationQueue:
6367
"""
6468
Manages the per-destination transmission queues.
@@ -370,7 +374,7 @@ async def _transaction_transmission_loop(self) -> None:
370374
),
371375
)
372376

373-
if e.retry_interval > 60 * 60 * 1000:
377+
if e.retry_interval > CATCHUP_RETRY_INTERVAL:
374378
# we won't retry for another hour!
375379
# (this suggests a significant outage)
376380
# We drop pending EDUs because otherwise they will

synapse/handlers/device.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -836,17 +836,16 @@ async def _handle_new_device_update_async(self) -> None:
836836
user_id,
837837
hosts,
838838
)
839-
for host in hosts:
840-
self.federation_sender.send_device_messages(
841-
host, immediate=False
842-
)
843-
# TODO: when called, this isn't in a logging context.
844-
# This leads to log spam, sentry event spam, and massive
845-
# memory usage.
846-
# See https://github.com/matrix-org/synapse/issues/12552.
847-
# log_kv(
848-
# {"message": "sent device update to host", "host": host}
849-
# )
839+
await self.federation_sender.send_device_messages(
840+
hosts, immediate=False
841+
)
842+
# TODO: when called, this isn't in a logging context.
843+
# This leads to log spam, sentry event spam, and massive
844+
# memory usage.
845+
# See https://github.com/matrix-org/synapse/issues/12552.
846+
# log_kv(
847+
# {"message": "sent device update to host", "host": host}
848+
# )
850849

851850
if current_stream_id != stream_id:
852851
# Clear the set of hosts we've already sent to as we're
@@ -951,8 +950,9 @@ async def handle_room_un_partial_stated(self, room_id: str) -> None:
951950

952951
# Notify things that device lists need to be sent out.
953952
self.notifier.notify_replication()
954-
for host in potentially_changed_hosts:
955-
self.federation_sender.send_device_messages(host, immediate=False)
953+
await self.federation_sender.send_device_messages(
954+
potentially_changed_hosts, immediate=False
955+
)
956956

957957

958958
def _update_device_from_client_ips(

synapse/handlers/devicemessage.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,9 @@ async def send_device_message(
302302
)
303303

304304
if self.federation_sender:
305-
for destination in remote_messages.keys():
306-
# Enqueue a new federation transaction to send the new
307-
# device messages to each remote destination.
308-
self.federation_sender.send_device_messages(destination)
305+
# Enqueue a new federation transaction to send the new
306+
# device messages to each remote destination.
307+
await self.federation_sender.send_device_messages(remote_messages.keys())
309308

310309
async def get_events_for_dehydrated_device(
311310
self,

synapse/handlers/presence.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,9 @@ async def maybe_send_presence_to_interested_destinations(
354354
)
355355

356356
for destination, host_states in hosts_to_states.items():
357-
self._federation.send_presence_to_destinations(host_states, [destination])
357+
await self._federation.send_presence_to_destinations(
358+
host_states, [destination]
359+
)
358360

359361
async def send_full_presence_to_users(self, user_ids: StrCollection) -> None:
360362
"""
@@ -936,7 +938,7 @@ async def _update_states(
936938
)
937939

938940
for destination, states in hosts_to_states.items():
939-
self._federation_queue.send_presence_to_destinations(
941+
await self._federation_queue.send_presence_to_destinations(
940942
states, [destination]
941943
)
942944

@@ -1508,7 +1510,7 @@ async def _handle_state_delta(self, room_id: str, deltas: List[JsonDict]) -> Non
15081510
or state.status_msg is not None
15091511
]
15101512

1511-
self._federation_queue.send_presence_to_destinations(
1513+
await self._federation_queue.send_presence_to_destinations(
15121514
destinations=newly_joined_remote_hosts,
15131515
states=states,
15141516
)
@@ -1519,7 +1521,7 @@ async def _handle_state_delta(self, room_id: str, deltas: List[JsonDict]) -> Non
15191521
prev_remote_hosts or newly_joined_remote_hosts
15201522
):
15211523
local_states = await self.current_state_for_users(newly_joined_local_users)
1522-
self._federation_queue.send_presence_to_destinations(
1524+
await self._federation_queue.send_presence_to_destinations(
15231525
destinations=prev_remote_hosts | newly_joined_remote_hosts,
15241526
states=list(local_states.values()),
15251527
)
@@ -2182,7 +2184,7 @@ def _clear_queue(self) -> None:
21822184
index = bisect(self._queue, (clear_before,))
21832185
self._queue = self._queue[index:]
21842186

2185-
def send_presence_to_destinations(
2187+
async def send_presence_to_destinations(
21862188
self, states: Collection[UserPresenceState], destinations: StrCollection
21872189
) -> None:
21882190
"""Send the presence states to the given destinations.
@@ -2202,7 +2204,7 @@ def send_presence_to_destinations(
22022204
return
22032205

22042206
if self._federation:
2205-
self._federation.send_presence_to_destinations(
2207+
await self._federation.send_presence_to_destinations(
22062208
states=states,
22072209
destinations=destinations,
22082210
)
@@ -2325,7 +2327,7 @@ async def process_replication_rows(
23252327

23262328
for host, user_ids in hosts_to_users.items():
23272329
states = await self._presence_handler.current_state_for_users(user_ids)
2328-
self._federation.send_presence_to_destinations(
2330+
await self._federation.send_presence_to_destinations(
23292331
states=states.values(),
23302332
destinations=[host],
23312333
)

synapse/handlers/typing.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@
2626
)
2727
from synapse.replication.tcp.streams import TypingStream
2828
from synapse.streams import EventSource
29-
from synapse.types import JsonDict, Requester, StreamKeyType, UserID
29+
from synapse.types import JsonDict, Requester, StrCollection, StreamKeyType, UserID
3030
from synapse.util.caches.stream_change_cache import StreamChangeCache
3131
from synapse.util.metrics import Measure
32+
from synapse.util.retryutils import filter_destinations_by_retry_limiter
3233
from synapse.util.wheel_timer import WheelTimer
3334

3435
if TYPE_CHECKING:
@@ -150,8 +151,15 @@ async def _push_remote(self, member: RoomMember, typing: bool) -> None:
150151
now=now, obj=member, then=now + FEDERATION_PING_INTERVAL
151152
)
152153

153-
hosts = await self._storage_controllers.state.get_current_hosts_in_room(
154-
member.room_id
154+
hosts: StrCollection = (
155+
await self._storage_controllers.state.get_current_hosts_in_room(
156+
member.room_id
157+
)
158+
)
159+
hosts = await filter_destinations_by_retry_limiter(
160+
hosts,
161+
clock=self.clock,
162+
store=self.store,
155163
)
156164
for domain in hosts:
157165
if not self.is_mine_server_name(domain):

synapse/module_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,7 @@ async def send_local_online_presence_to(self, users: Iterable[str]) -> None:
11801180

11811181
# Send to remote destinations.
11821182
destination = UserID.from_string(user).domain
1183-
presence_handler.get_federation_queue().send_presence_to_destinations(
1183+
await presence_handler.get_federation_queue().send_presence_to_destinations(
11841184
presence_events, [destination]
11851185
)
11861186

0 commit comments

Comments
 (0)