This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit aeaa36d

Instrument the federation/backfill part of /messages

Split out from #13440

1 parent 1b09b08

10 files changed: +164 -23 lines
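
The instrumentation below leans on two decorators from synapse.logging.opentracing: @trace, which runs the decorated function inside a new active span named after it, and @tag_args, which records the function's arguments as tags on that span. As a rough sketch (this is not Synapse's actual implementation, and it ignores async functions), a tag_args-style decorator could look like this:

import inspect
from functools import wraps

from opentracing import global_tracer


def tag_args(func):
    """Illustrative only: tag the active span with each named argument,
    using the "ARG." prefix this commit introduces as FUNC_ARG_PREFIX."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        span = global_tracer().active_span
        if span is not None:
            # Map positional arguments back to their parameter names.
            bound = inspect.signature(func).bind(*args, **kwargs)
            for name, value in bound.arguments.items():
                span.set_tag("ARG." + name, str(value))
        return func(*args, **kwargs)

    return wrapper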

synapse/federation/federation_client.py (18 additions & 1 deletion)

@@ -61,7 +61,7 @@
 )
 from synapse.federation.transport.client import SendJoinResponse
 from synapse.http.types import QueryParams
-from synapse.logging.opentracing import trace
+from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace
 from synapse.types import JsonDict, UserID, get_domain_from_id
 from synapse.util.async_helpers import concurrently_execute
 from synapse.util.caches.expiringcache import ExpiringCache
@@ -235,6 +235,7 @@ async def claim_client_keys(
         )

     @trace
+    @tag_args
     async def backfill(
         self, dest: str, room_id: str, limit: int, extremities: Collection[str]
     ) -> Optional[List[EventBase]]:
@@ -337,6 +338,8 @@ async def get_pdu_from_destination_raw(

         return None

+    @trace
+    @tag_args
     async def get_pdu(
         self,
         destinations: Iterable[str],
@@ -448,6 +451,8 @@ async def get_pdu(

         return event_copy

+    @trace
+    @tag_args
     async def get_room_state_ids(
         self, destination: str, room_id: str, event_id: str
     ) -> Tuple[List[str], List[str]]:
@@ -467,13 +472,24 @@ async def get_room_state_ids(
         state_event_ids = result["pdu_ids"]
         auth_event_ids = result.get("auth_chain_ids", [])

+        set_tag(
+            SynapseTags.RESULT_PREFIX + f"state_event_ids ({len(state_event_ids)})",
+            str(state_event_ids),
+        )
+        set_tag(
+            SynapseTags.RESULT_PREFIX + f"auth_event_ids ({len(auth_event_ids)})",
+            str(auth_event_ids),
+        )
+
         if not isinstance(state_event_ids, list) or not isinstance(
             auth_event_ids, list
         ):
             raise InvalidResponseError("invalid response from /state_ids")

         return state_event_ids, auth_event_ids

+    @trace
+    @tag_args
     async def get_room_state(
         self,
         destination: str,
@@ -533,6 +549,7 @@ async def get_room_state(

         return valid_state_events, valid_auth_events

+    @trace
     async def _check_sigs_and_hash_and_fetch(
         self,
         origin: str,
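
With @trace and @tag_args applied, a call such as get_room_state_ids shows up in the tracing UI as its own span, carrying both the argument tags and the RESULT.* tags set above. Purely for illustration (the homeserver name and event IDs are made up, and the exact key for each argument depends on how tag_args names it), the span's tags might look like:

# Hypothetical tags on a get_room_state_ids span after this change.
expected_tags = {
    "ARG.destination": "remote.example.org",
    "ARG.room_id": "!room:example.org",
    "ARG.event_id": "$abc123",
    "RESULT.state_event_ids (2)": "['$state1', '$state2']",
    "RESULT.auth_event_ids (1)": "['$auth1']",
}

Note the convention of embedding the collection size in the tag name, which keeps the count visible without expanding the stringified list.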

synapse/handlers/federation.py (6 additions & 1 deletion)

@@ -59,7 +59,7 @@
 from synapse.federation.federation_client import InvalidResponseError
 from synapse.http.servlet import assert_params_in_dict
 from synapse.logging.context import nested_logging_context
-from synapse.logging.opentracing import trace
+from synapse.logging.opentracing import SynapseTags, set_tag, trace
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.module_api import NOT_SPAM
 from synapse.replication.http.federation import (
@@ -370,6 +370,11 @@ async def _maybe_backfill_inner(
         logger.debug(
             "_maybe_backfill_inner: extremities_to_request %s", extremities_to_request
         )
+        set_tag(
+            SynapseTags.RESULT_PREFIX
+            + f"extremities_to_request {len(extremities_to_request)}",
+            str(extremities_to_request),
+        )

         # Now we need to decide which hosts to hit first.

synapse/handlers/federation_event.py (73 additions & 14 deletions)

@@ -59,7 +59,13 @@
 from synapse.events.snapshot import EventContext
 from synapse.federation.federation_client import InvalidResponseError
 from synapse.logging.context import nested_logging_context
-from synapse.logging.opentracing import trace
+from synapse.logging.opentracing import (
+    SynapseTags,
+    set_tag,
+    start_active_span,
+    tag_args,
+    trace,
+)
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.replication.http.devices import ReplicationUserDevicesResyncRestServlet
 from synapse.replication.http.federation import (
@@ -410,6 +416,7 @@ async def check_join_restrictions(
                 prev_member_event,
             )

+    @trace
     async def process_remote_join(
         self,
         origin: str,
@@ -715,7 +722,7 @@ async def _get_missing_events_for_pdu(

     @trace
     async def _process_pulled_events(
-        self, origin: str, events: Iterable[EventBase], backfilled: bool
+        self, origin: str, events: List[EventBase], backfilled: bool
     ) -> None:
         """Process a batch of events we have pulled from a remote server

@@ -730,6 +737,11 @@ async def _process_pulled_events(
             backfilled: True if this is part of a historical batch of events (inhibits
                 notification to clients, and validation of device keys.)
         """
+        set_tag(
+            SynapseTags.FUNC_ARG_PREFIX + f"event_ids ({len(events)})",
+            str([event.event_id for event in events]),
+        )
+        set_tag(SynapseTags.FUNC_ARG_PREFIX + "backfilled", str(backfilled))
         logger.debug(
             "processing pulled backfilled=%s events=%s",
             backfilled,
@@ -753,6 +765,7 @@ async def _process_pulled_events(
             await self._process_pulled_event(origin, ev, backfilled=backfilled)

     @trace
+    @tag_args
     async def _process_pulled_event(
         self, origin: str, event: EventBase, backfilled: bool
     ) -> None:
@@ -854,6 +867,7 @@ async def _process_pulled_event(
             else:
                 raise

+    @trace
     async def _compute_event_context_with_maybe_missing_prevs(
         self, dest: str, event: EventBase
     ) -> EventContext:
@@ -970,6 +984,8 @@ async def _compute_event_context_with_maybe_missing_prevs(
             event, state_ids_before_event=state_map, partial_state=partial_state
         )

+    @trace
+    @tag_args
     async def _get_state_ids_after_missing_prev_event(
         self,
         destination: str,
@@ -1009,10 +1025,10 @@ async def _get_state_ids_after_missing_prev_event(
         logger.debug("Fetching %i events from cache/store", len(desired_events))
         have_events = await self._store.have_seen_events(room_id, desired_events)

-        missing_desired_events = desired_events - have_events
+        missing_desired_event_ids = desired_events - have_events
         logger.debug(
             "We are missing %i events (got %i)",
-            len(missing_desired_events),
+            len(missing_desired_event_ids),
             len(have_events),
         )

@@ -1024,13 +1040,24 @@ async def _get_state_ids_after_missing_prev_event(
         # already have a bunch of the state events. It would be nice if the
         # federation api gave us a way of finding out which we actually need.

-        missing_auth_events = set(auth_event_ids) - have_events
-        missing_auth_events.difference_update(
-            await self._store.have_seen_events(room_id, missing_auth_events)
+        missing_auth_event_ids = set(auth_event_ids) - have_events
+        missing_auth_event_ids.difference_update(
+            await self._store.have_seen_events(room_id, missing_auth_event_ids)
         )
-        logger.debug("We are also missing %i auth events", len(missing_auth_events))
+        logger.debug("We are also missing %i auth events", len(missing_auth_event_ids))
+
+        missing_event_ids = missing_desired_event_ids | missing_auth_event_ids

-        missing_events = missing_desired_events | missing_auth_events
+        set_tag(
+            SynapseTags.RESULT_PREFIX
+            + f"missing_auth_event_ids ({len(missing_auth_event_ids)})",
+            str(missing_auth_event_ids),
+        )
+        set_tag(
+            SynapseTags.RESULT_PREFIX
+            + f"missing_desired_event_ids ({len(missing_desired_event_ids)})",
+            str(missing_desired_event_ids),
+        )

         # Making an individual request for each of 1000s of events has a lot of
         # overhead. On the other hand, we don't really want to fetch all of the events
@@ -1041,13 +1068,13 @@ async def _get_state_ids_after_missing_prev_event(
         #
         # TODO: might it be better to have an API which lets us do an aggregate event
         # request
-        if (len(missing_events) * 10) >= len(auth_event_ids) + len(state_event_ids):
+        if (len(missing_event_ids) * 10) >= len(auth_event_ids) + len(state_event_ids):
             logger.debug("Requesting complete state from remote")
             await self._get_state_and_persist(destination, room_id, event_id)
         else:
-            logger.debug("Fetching %i events from remote", len(missing_events))
+            logger.debug("Fetching %i events from remote", len(missing_event_ids))
             await self._get_events_and_persist(
-                destination=destination, room_id=room_id, event_ids=missing_events
+                destination=destination, room_id=room_id, event_ids=missing_event_ids
             )

         # We now need to fill out the state map, which involves fetching the
@@ -1104,6 +1131,10 @@ async def _get_state_ids_after_missing_prev_event(
                 event_id,
                 failed_to_fetch,
             )
+        set_tag(
+            SynapseTags.RESULT_PREFIX + f"failed_to_fetch ({len(failed_to_fetch)})",
+            str(failed_to_fetch),
+        )

         if remote_event.is_state() and remote_event.rejected_reason is None:
             state_map[
@@ -1112,6 +1143,8 @@

         return state_map

+    @trace
+    @tag_args
     async def _get_state_and_persist(
         self, destination: str, room_id: str, event_id: str
     ) -> None:
@@ -1133,6 +1166,7 @@
             destination=destination, room_id=room_id, event_ids=(event_id,)
         )

+    @trace
     async def _process_received_pdu(
         self,
         origin: str,
@@ -1283,6 +1317,7 @@ async def _resync_device(self, sender: str) -> None:
         except Exception:
             logger.exception("Failed to resync device for %s", sender)

+    @trace
     async def _handle_marker_event(self, origin: str, marker_event: EventBase) -> None:
         """Handles backfilling the insertion event when we receive a marker
         event that points to one.
@@ -1414,6 +1449,8 @@ async def backfill_event_id(

         return event_from_response

+    @trace
+    @tag_args
     async def _get_events_and_persist(
         self, destination: str, room_id: str, event_ids: Collection[str]
     ) -> None:
@@ -1459,6 +1496,7 @@ async def get_event(event_id: str) -> None:
         logger.info("Fetched %i events of %i requested", len(events), len(event_ids))
         await self._auth_and_persist_outliers(room_id, events)

+    @trace
     async def _auth_and_persist_outliers(
         self, room_id: str, events: Iterable[EventBase]
     ) -> None:
@@ -1477,6 +1515,12 @@ async def _auth_and_persist_outliers(
         """
         event_map = {event.event_id: event for event in events}

+        event_ids = event_map.keys()
+        set_tag(
+            SynapseTags.FUNC_ARG_PREFIX + f"event_ids ({len(event_ids)})",
+            str(event_ids),
+        )
+
         # filter out any events we have already seen. This might happen because
         # the events were eagerly pushed to us (eg, during a room join), or because
         # another thread has raced against us since we decided to request the event.
@@ -1593,6 +1637,7 @@ async def prep(event: EventBase) -> None:
                 backfilled=True,
             )

+    @trace
     async def _check_event_auth(
         self, origin: Optional[str], event: EventBase, context: EventContext
     ) -> None:
@@ -1631,6 +1676,11 @@ async def _check_event_auth(
         claimed_auth_events = await self._load_or_fetch_auth_events_for_event(
             origin, event
         )
+        set_tag(
+            SynapseTags.RESULT_PREFIX
+            + f"claimed_auth_events ({len(claimed_auth_events)})",
+            str([ev.event_id for ev in claimed_auth_events]),
+        )

         # ... and check that the event passes auth at those auth events.
         # https://spec.matrix.org/v1.3/server-server-api/#checks-performed-on-receipt-of-a-pdu:
@@ -1728,6 +1778,7 @@ async def _check_event_auth(
             )
             context.rejected = RejectedReason.AUTH_ERROR

+    @trace
     async def _maybe_kick_guest_users(self, event: EventBase) -> None:
         if event.type != EventTypes.GuestAccess:
             return
@@ -1935,6 +1986,8 @@ async def _load_or_fetch_auth_events_for_event(
         # instead we raise an AuthError, which will make the caller ignore it.
         raise AuthError(code=HTTPStatus.FORBIDDEN, msg="Auth events could not be found")

+    @trace
+    @tag_args
     async def _get_remote_auth_chain_for_event(
         self, destination: str, room_id: str, event_id: str
     ) -> None:
@@ -1963,6 +2016,7 @@ async def _get_remote_auth_chain_for_event(

         await self._auth_and_persist_outliers(room_id, remote_auth_events)

+    @trace
     async def _run_push_actions_and_persist_event(
         self, event: EventBase, context: EventContext, backfilled: bool = False
     ) -> None:
@@ -2071,8 +2125,13 @@ async def persist_events_and_notify(
             self._message_handler.maybe_schedule_expiry(event)

         if not backfilled:  # Never notify for backfilled events
-            for event in events:
-                await self._notify_persisted_event(event, max_stream_token)
+            with start_active_span("notify_persisted_events"):
+                set_tag(
+                    SynapseTags.RESULT_PREFIX + f"event_ids ({len(events)})",
+                    str([ev.event_id for ev in events]),
+                )
+                for event in events:
+                    await self._notify_persisted_event(event, max_stream_token)

         return max_stream_token.stream
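
The final hunk above also uses the other new import, start_active_span, to wrap the notify loop in a dedicated child span, so the time spent notifying clients shows up as its own unit in a trace. A minimal sketch of the same pattern (assuming Synapse's tracing helpers are importable and a reporter is configured; notify_one is a hypothetical callback):

from synapse.logging.opentracing import set_tag, start_active_span


async def notify_batch(events, notify_one):
    # Group the whole loop under one child span, as
    # persist_events_and_notify now does, and tag it with a
    # summary of the batch.
    with start_active_span("notify_persisted_events"):
        set_tag(
            f"RESULT.event_ids ({len(events)})",
            str([ev.event_id for ev in events]),
        )
        for event in events:
            await notify_one(event)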

synapse/logging/opentracing.py (13 additions & 0 deletions)

@@ -310,6 +310,19 @@ class SynapseTags:
     # The name of the external cache
     CACHE_NAME = "cache.name"

+    # Used to tag function arguments
+    #
+    # Tag a named arg. The name of the argument should be appended to this
+    # prefix
+    FUNC_ARG_PREFIX = "ARG."
+    # Tag extra variadic number of positional arguments (`def foo(first, second, *extras)`)
+    FUNC_ARGS = "args"
+    # Tag keyword args
+    FUNC_KWARGS = "kwargs"
+
+    # Some intermediate result that's interesting to the function
+    RESULT_PREFIX = "RESULT."
+

 class SynapseBaggage:
     FORCE_TRACING = "synapse-force-tracing"
