
Commit bc1beeb

Refactor `have_seen_events` to reduce OOMs (#12886)

My server is currently OOMing in the middle of `have_seen_events`, so let's try to fix that.

Parent: 49f0686

File tree: 2 files changed (+25, -18 lines)


changelog.d/12886.misc (1 addition, 0 deletions)

@@ -0,0 +1 @@
+Refactor `have_seen_events` to reduce memory consumed when processing federation traffic.
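
For context, a minimal sketch of a call site for the refactored method; `process_incoming_events`, `store`, `room_id`, and `event_ids` are assumed names for illustration, not part of this commit:

async def process_incoming_events(store, room_id: str, event_ids: list) -> None:
    # Ask the store which of these federation event IDs are already persisted.
    seen = await store.have_seen_events(room_id, event_ids)
    # Anything not in `seen` still needs to be fetched and processed.
    missing = [eid for eid in event_ids if eid not in seen]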

synapse/storage/databases/main/events_worker.py (24 additions, 18 deletions)
@@ -1356,14 +1356,23 @@ async def have_seen_events(
         Returns:
             The set of events we have already seen.
         """
-        res = await self._have_seen_events_dict(
-            (room_id, event_id) for event_id in event_ids
-        )
-        return {eid for ((_rid, eid), have_event) in res.items() if have_event}
+
+        # @cachedList chomps lots of memory if you call it with a big list, so
+        # we break it down. However, each batch requires its own index scan, so we make
+        # the batches as big as possible.
+
+        results: Set[str] = set()
+        for chunk in batch_iter(event_ids, 500):
+            r = await self._have_seen_events_dict(
+                [(room_id, event_id) for event_id in chunk]
+            )
+            results.update(eid for ((_rid, eid), have_event) in r.items() if have_event)
+
+        return results

     @cachedList(cached_method_name="have_seen_event", list_name="keys")
     async def _have_seen_events_dict(
-        self, keys: Iterable[Tuple[str, str]]
+        self, keys: Collection[Tuple[str, str]]
     ) -> Dict[Tuple[str, str], bool]:
         """Helper for have_seen_events
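
The batching above relies on `batch_iter` from `synapse.util.iterutils` to split `event_ids` into chunks of 500. As a rough, illustrative sketch of what such a helper does (an assumed equivalent, not the exact Synapse source):

from itertools import islice
from typing import Iterable, Iterator, Tuple, TypeVar

T = TypeVar("T")

def batch_iter(iterable: Iterable[T], size: int) -> Iterator[Tuple[T, ...]]:
    """Yield tuples of at most `size` items until `iterable` is exhausted."""
    source = iter(iterable)
    # iter(callable, sentinel) keeps calling the lambda and stops when it
    # returns the sentinel: the empty tuple produced once `source` runs dry.
    return iter(lambda: tuple(islice(source, size)), ())

The chunk size of 500 bounds how many keys reach the `@cachedList`-decorated helper at once, which is what was consuming memory; each chunk still costs one index scan, hence the comment about making the batches as big as possible.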
@@ -1375,33 +1384,30 @@ async def _have_seen_events_dict(
         cache_results = {
             (rid, eid) for (rid, eid) in keys if self._get_event_cache.contains((eid,))
         }
-        results = {x: True for x in cache_results}
+        results = dict.fromkeys(cache_results, True)
+        remaining = [k for k in keys if k not in cache_results]
+        if not remaining:
+            return results

-        def have_seen_events_txn(
-            txn: LoggingTransaction, chunk: Tuple[Tuple[str, str], ...]
-        ) -> None:
+        def have_seen_events_txn(txn: LoggingTransaction) -> None:
             # we deliberately do *not* query the database for room_id, to make the
             # query an index-only lookup on `events_event_id_key`.
             #
             # We therefore pull the events from the database into a set...

             sql = "SELECT event_id FROM events AS e WHERE "
             clause, args = make_in_list_sql_clause(
-                txn.database_engine, "e.event_id", [eid for (_rid, eid) in chunk]
+                txn.database_engine, "e.event_id", [eid for (_rid, eid) in remaining]
             )
             txn.execute(sql + clause, args)
             found_events = {eid for eid, in txn}

-            # ... and then we can update the results for each row in the batch
-            results.update({(rid, eid): (eid in found_events) for (rid, eid) in chunk})
-
-        # each batch requires its own index scan, so we make the batches as big as
-        # possible.
-        for chunk in batch_iter((k for k in keys if k not in cache_results), 500):
-            await self.db_pool.runInteraction(
-                "have_seen_events", have_seen_events_txn, chunk
+            # ... and then we can update the results for each key
+            results.update(
+                {(rid, eid): (eid in found_events) for (rid, eid) in remaining}
             )

+        await self.db_pool.runInteraction("have_seen_events", have_seen_events_txn)
         return results

     @cached(max_entries=100000, tree=True)
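
A note on the query shape: the transaction selects only `event_id` so the lookup can be satisfied from the `events_event_id_key` index alone, and `make_in_list_sql_clause` builds the parameterised membership test. A minimal sketch of that general technique, assuming a boolean flag in place of Synapse's real database-engine object (the actual helper's signature may differ):

from typing import Iterable, List, Tuple

def make_in_list_clause_sketch(
    is_postgres: bool, column: str, values: Iterable[str]
) -> Tuple[str, List]:
    """Illustrative version of building a parameterised IN-list clause."""
    vals = list(values)
    if is_postgres:
        # Postgres can take the whole list as a single array parameter,
        # keeping the SQL text identical however many values there are.
        return f"{column} = ANY(?)", [vals]
    # Otherwise fall back to one placeholder per value.
    placeholders = ", ".join("?" for _ in vals)
    return f"{column} IN ({placeholders})", vals

The new early return when `remaining` is empty also means the clause is never built for an empty key list.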
