Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit c3a4780

Browse files
author
David Robertson
authored
When restarting a partial join resync, prioritise the server which actioned a partial join (#14126)
1 parent 4af93bd commit c3a4780

File tree

6 files changed

+95
-31
lines changed

6 files changed

+95
-31
lines changed

changelog.d/14126.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Faster joins: prioritise the server we joined by when restarting a partial join resync.

synapse/handlers/device.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -937,7 +937,10 @@ async def incoming_device_list_update(
937937
# Check if we are partially joining any rooms. If so we need to store
938938
# all device list updates so that we can handle them correctly once we
939939
# know who is in the room.
940-
partial_rooms = await self.store.get_partial_state_rooms_and_servers()
940+
# TODO(faster joins): this fetches and processes a bunch of data that we don't
941+
# use. Could be replaced by a tighter query e.g.
942+
# SELECT EXISTS(SELECT 1 FROM partial_state_rooms)
943+
partial_rooms = await self.store.get_partial_state_room_resync_info()
941944
if partial_rooms:
942945
await self.store.add_remote_device_list_to_pending(
943946
user_id,

synapse/handlers/federation.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,7 @@ async def do_invite_join(
632632
room_id=room_id,
633633
servers=ret.servers_in_room,
634634
device_lists_stream_id=self.store.get_device_stream_token(),
635+
joined_via=origin,
635636
)
636637

637638
try:
@@ -1615,13 +1616,13 @@ async def _resume_sync_partial_state_room(self) -> None:
16151616
"""Resumes resyncing of all partial-state rooms after a restart."""
16161617
assert not self.config.worker.worker_app
16171618

1618-
partial_state_rooms = await self.store.get_partial_state_rooms_and_servers()
1619-
for room_id, servers_in_room in partial_state_rooms.items():
1619+
partial_state_rooms = await self.store.get_partial_state_room_resync_info()
1620+
for room_id, resync_info in partial_state_rooms.items():
16201621
run_as_background_process(
16211622
desc="sync_partial_state_room",
16221623
func=self._sync_partial_state_room,
1623-
initial_destination=None,
1624-
other_destinations=servers_in_room,
1624+
initial_destination=resync_info.joined_via,
1625+
other_destinations=resync_info.servers_in_room,
16251626
room_id=room_id,
16261627
)
16271628

@@ -1650,28 +1651,12 @@ async def _sync_partial_state_room(
16501651
# really leave, that might mean we have difficulty getting the room state over
16511652
# federation.
16521653
# https://github.com/matrix-org/synapse/issues/12802
1653-
#
1654-
# TODO(faster_joins): we need some way of prioritising which homeservers in
1655-
# `other_destinations` to try first, otherwise we'll spend ages trying dead
1656-
# homeservers for large rooms.
1657-
# https://github.com/matrix-org/synapse/issues/12999
1658-
1659-
if initial_destination is None and len(other_destinations) == 0:
1660-
raise ValueError(
1661-
f"Cannot resync state of {room_id}: no destinations provided"
1662-
)
16631654

16641655
# Make an infinite iterator of destinations to try. Once we find a working
16651656
# destination, we'll stick with it until it flakes.
1666-
destinations: Collection[str]
1667-
if initial_destination is not None:
1668-
# Move `initial_destination` to the front of the list.
1669-
destinations = list(other_destinations)
1670-
if initial_destination in destinations:
1671-
destinations.remove(initial_destination)
1672-
destinations = [initial_destination] + destinations
1673-
else:
1674-
destinations = other_destinations
1657+
destinations = _prioritise_destinations_for_partial_state_resync(
1658+
initial_destination, other_destinations, room_id
1659+
)
16751660
destination_iter = itertools.cycle(destinations)
16761661

16771662
# `destination` is the current remote homeserver we're pulling from.
@@ -1769,3 +1754,29 @@ async def _sync_partial_state_room(
17691754
room_id,
17701755
destination,
17711756
)
1757+
1758+
1759+
def _prioritise_destinations_for_partial_state_resync(
1760+
initial_destination: Optional[str],
1761+
other_destinations: Collection[str],
1762+
room_id: str,
1763+
) -> Collection[str]:
1764+
"""Work out the order in which we should ask servers to resync events.
1765+
1766+
If an `initial_destination` is given, it takes top priority. Otherwise
1767+
all servers are treated equally.
1768+
1769+
:raises ValueError: if no destination is provided at all.
1770+
"""
1771+
if initial_destination is None and len(other_destinations) == 0:
1772+
raise ValueError(f"Cannot resync state of {room_id}: no destinations provided")
1773+
1774+
if initial_destination is None:
1775+
return other_destinations
1776+
1777+
# Move `initial_destination` to the front of the list.
1778+
destinations = list(other_destinations)
1779+
if initial_destination in destinations:
1780+
destinations.remove(initial_destination)
1781+
destinations = [initial_destination] + destinations
1782+
return destinations

synapse/storage/database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1658,7 +1658,7 @@ async def simple_select_one_onecol(
16581658
table: string giving the table name
16591659
keyvalues: dict of column names and values to select the row with
16601660
retcol: string giving the name of the column to return
1661-
allow_none: If true, return None instead of failing if the SELECT
1661+
allow_none: If true, return None instead of raising StoreError if the SELECT
16621662
statement returns no rows
16631663
desc: description of the transaction, for logging and metrics
16641664
"""

synapse/storage/databases/main/room.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ class RoomSortOrder(Enum):
9797
STATE_EVENTS = "state_events"
9898

9999

100+
@attr.s(slots=True, frozen=True, auto_attribs=True)
101+
class PartialStateResyncInfo:
102+
joined_via: Optional[str]
103+
servers_in_room: List[str] = attr.ib(factory=list)
104+
105+
100106
class RoomWorkerStore(CacheInvalidationWorkerStore):
101107
def __init__(
102108
self,
@@ -1160,17 +1166,29 @@ async def get_partial_state_servers_at_join(self, room_id: str) -> Sequence[str]
11601166
desc="get_partial_state_servers_at_join",
11611167
)
11621168

1163-
async def get_partial_state_rooms_and_servers(
1169+
async def get_partial_state_room_resync_info(
11641170
self,
1165-
) -> Mapping[str, Collection[str]]:
1166-
"""Get all rooms containing events with partial state, and the servers known
1167-
to be in the room.
1171+
) -> Mapping[str, PartialStateResyncInfo]:
1172+
"""Get all rooms containing events with partial state, and the information
1173+
needed to restart a "resync" of those rooms.
11681174
11691175
Returns:
11701176
A dictionary of rooms with partial state, with room IDs as keys and
11711177
lists of servers in rooms as values.
11721178
"""
1173-
room_servers: Dict[str, List[str]] = {}
1179+
room_servers: Dict[str, PartialStateResyncInfo] = {}
1180+
1181+
rows = await self.db_pool.simple_select_list(
1182+
table="partial_state_rooms",
1183+
keyvalues={},
1184+
retcols=("room_id", "joined_via"),
1185+
desc="get_server_which_served_partial_join",
1186+
)
1187+
1188+
for row in rows:
1189+
room_id = row["room_id"]
1190+
joined_via = row["joined_via"]
1191+
room_servers[room_id] = PartialStateResyncInfo(joined_via=joined_via)
11741192

11751193
rows = await self.db_pool.simple_select_list(
11761194
"partial_state_rooms_servers",
@@ -1182,7 +1200,15 @@ async def get_partial_state_rooms_and_servers(
11821200
for row in rows:
11831201
room_id = row["room_id"]
11841202
server_name = row["server_name"]
1185-
room_servers.setdefault(room_id, []).append(server_name)
1203+
entry = room_servers.get(room_id)
1204+
if entry is None:
1205+
# There is a foreign key constraint which enforces that every room_id in
1206+
# partial_state_rooms_servers appears in partial_state_rooms. So we
1207+
# expect `entry` to be non-null. (This reasoning fails if we've
1208+
# partial-joined between the two SELECTs, but this is unlikely to happen
1209+
# in practice.)
1210+
continue
1211+
entry.servers_in_room.append(server_name)
11861212

11871213
return room_servers
11881214

@@ -1827,6 +1853,7 @@ async def store_partial_state_room(
18271853
room_id: str,
18281854
servers: Collection[str],
18291855
device_lists_stream_id: int,
1856+
joined_via: str,
18301857
) -> None:
18311858
"""Mark the given room as containing events with partial state.
18321859
@@ -1842,13 +1869,15 @@ async def store_partial_state_room(
18421869
servers: other servers known to be in the room
18431870
device_lists_stream_id: the device_lists stream ID at the time when we first
18441871
joined the room.
1872+
joined_via: the server name we requested a partial join from.
18451873
"""
18461874
await self.db_pool.runInteraction(
18471875
"store_partial_state_room",
18481876
self._store_partial_state_room_txn,
18491877
room_id,
18501878
servers,
18511879
device_lists_stream_id,
1880+
joined_via,
18521881
)
18531882

18541883
def _store_partial_state_room_txn(
@@ -1857,6 +1886,7 @@ def _store_partial_state_room_txn(
18571886
room_id: str,
18581887
servers: Collection[str],
18591888
device_lists_stream_id: int,
1889+
joined_via: str,
18601890
) -> None:
18611891
DatabasePool.simple_insert_txn(
18621892
txn,
@@ -1866,6 +1896,7 @@ def _store_partial_state_room_txn(
18661896
"device_lists_stream_id": device_lists_stream_id,
18671897
# To be updated later once the join event is persisted.
18681898
"join_event_id": None,
1899+
"joined_via": joined_via,
18691900
},
18701901
)
18711902
DatabasePool.simple_insert_many_txn(
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/* Copyright 2022 The Matrix.org Foundation C.I.C
2+
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
-- When we resync partial state, we prioritise doing so using the server we
17+
-- partial-joined from. To do this we need to record which server that was!
18+
ALTER TABLE partial_state_rooms ADD COLUMN joined_via TEXT;

0 commit comments

Comments
 (0)