
Commit 9b390a3

Stop getting missing prev_events after we already know their signature is invalid
Related to #13622, #13635, and #13676. Follow-up to #13815, which tracks event signature failures.

This PR aims to stop us from trying `_get_state_ids_after_missing_prev_event` after we already know, from a previous attempt, that the prev_event will fail.

To explain an exact scenario around `/messages` -> backfill: we call `/backfill` and first check the signatures of the 100 events. We see bad signatures for `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` and `$zuOn2Rd2vsC7SUia3Hp3r6JSkSFKcc5j3QTTqW_0jDw` (both member events). Then we process the 98 remaining events that have valid signatures, but one of them references `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` as a `prev_event`. So we have to go through the whole `_get_state_ids_after_missing_prev_event` rigmarole, which pulls in those same events, and they fail again because the signatures are still invalid.

- `backfill`
  - `outgoing-federation-request` `/backfill`
  - `_check_sigs_and_hash_and_fetch`
    - `_check_sigs_and_hash_and_fetch_one` for each event received over backfill
      - ❗ `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` fails with `Signature on retrieved event was invalid.`: `unable to verify signature for sender domain xxx: 401: Failed to find any key to satisfy: _FetchKeyRequest(...)`
      - ❗ `$zuOn2Rd2vsC7SUia3Hp3r6JSkSFKcc5j3QTTqW_0jDw` fails with `Signature on retrieved event was invalid.`: `unable to verify signature for sender domain xxx: 401: Failed to find any key to satisfy: _FetchKeyRequest(...)`
  - `_process_pulled_events`
    - `_process_pulled_event` for each validated event
      - ❗ Event `$Q0iMdqtz3IJYfZQU2Xk2WjB5NDF8Gg8cFSYYyKQgKJ0` references `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` as a `prev_event`, which is missing, so we try to get it
        - `_get_state_ids_after_missing_prev_event`
          - `outgoing-federation-request` `/state_ids`
          - ❗ `get_pdu` for `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M`, which fails the signature check again
          - ❗ `get_pdu` for `$zuOn2Rd2vsC7SUia3Hp3r6JSkSFKcc5j3QTTqW_0jDw`, which fails the signature check

With this PR, we no longer call the burdensome `_get_state_ids_after_missing_prev_event`, because the signature failure will already count as a pull attempt before we try to run it.
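To make the backoff concrete, here is a minimal Python sketch (illustration only, not Synapse code; `next_allowed_pull_ms` is a hypothetical helper, though the two constants mirror the ones added in this commit) of the schedule the new storage filter encodes in SQL: a one-hour step that doubles with each failed pull attempt, capped at seven days.

```python
import datetime

# Mirrors the constants added to event_federation.py in this commit.
BACKOFF_STEP_MS = int(datetime.timedelta(hours=1).total_seconds()) * 1000
BACKOFF_UPPER_BOUND_MS = int(datetime.timedelta(days=7).total_seconds()) * 1000


def next_allowed_pull_ms(last_attempt_ts: int, num_attempts: int) -> int:
    """Earliest timestamp (ms) at which we would try pulling the event again.

    Hypothetical helper: in the commit the real logic lives in SQL as
    `last_attempt_ts + least((1 << num_attempts) * step, upper_bound)`.
    """
    backoff_ms = min((1 << num_attempts) * BACKOFF_STEP_MS, BACKOFF_UPPER_BOUND_MS)
    return last_attempt_ts + backoff_ms


# After 1, 2, 3 failed attempts we wait 2h, 4h, 8h before retrying the pull.
assert next_allowed_pull_ms(0, 1) == 2 * 60 * 60 * 1000
assert next_allowed_pull_ms(0, 2) == 4 * 60 * 60 * 1000
```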
1 parent cfb4e88 commit 9b390a3

File tree

2 files changed: +87 lines, -1 line


synapse/handlers/federation_event.py

Lines changed: 6 additions & 0 deletions
@@ -955,6 +955,12 @@ async def _compute_event_context_with_maybe_missing_prevs(
         seen = await self._store.have_events_in_timeline(prevs)
         missing_prevs = prevs - seen

+        # Filter out events we've tried to pull recently
+        prevs_to_ignore = await self._store.filter_events_with_pull_attempt_backoff(
+            room_id, missing_prevs
+        )
+        missing_prevs = missing_prevs - prevs_to_ignore
+
         if not missing_prevs:
             return await self._state_handler.compute_event_context(event)

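For illustration, a self-contained sketch of what the handler change above does to `missing_prevs`; plain Python sets stand in for the datastore call, and the event IDs are made up.

```python
# Hypothetical event IDs; the store lookup is simulated with a plain set.
prevs = {"$event_a", "$event_b", "$bad_sig_event"}
seen = {"$event_a"}  # already in our timeline

missing_prevs = prevs - seen  # {"$event_b", "$bad_sig_event"}

# Suppose the store reports that "$bad_sig_event" failed a recent pull
# attempt and is still inside its backoff window.
prevs_to_ignore = {"$bad_sig_event"}
missing_prevs = missing_prevs - prevs_to_ignore  # {"$event_b"}

# Only if missing_prevs is still non-empty does the handler fall into
# _get_state_ids_after_missing_prev_event and its /state_ids request.
print(missing_prevs)
```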
synapse/storage/databases/main/event_federation.py

Lines changed: 81 additions & 1 deletion
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import datetime
 import itertools
 import logging
 from queue import Empty, PriorityQueue
@@ -21,6 +22,7 @@
     Iterable,
     List,
     Optional,
+    Sequence,
     Set,
     Tuple,
     cast,
@@ -43,7 +45,7 @@
 )
 from synapse.storage.databases.main.events_worker import EventsWorkerStore
 from synapse.storage.databases.main.signatures import SignatureWorkerStore
-from synapse.storage.engines import PostgresEngine
+from synapse.storage.engines import PostgresEngine, Sqlite3Engine
 from synapse.types import JsonDict
 from synapse.util import json_encoder
 from synapse.util.caches.descriptors import cached
@@ -72,6 +74,12 @@

 logger = logging.getLogger(__name__)

+PULLED_EVENT_BACKOFF_UPPER_BOUND_SECONDS: int = int(
+    datetime.timedelta(days=7).total_seconds()
+)
+PULLED_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS: int = int(
+    datetime.timedelta(hours=1).total_seconds()
+)

 # All the info we need while iterating the DAG while backfilling
 @attr.s(frozen=True, slots=True, auto_attribs=True)
@@ -1339,6 +1347,78 @@ def _record_event_failed_pull_attempt_upsert_txn(

         txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))

+    @trace
+    async def filter_events_with_pull_attempt_backoff(
+        self,
+        room_id: str,
+        event_ids: Sequence[str],
+    ) -> List[str]:
+        """
+        Filter out events that we've failed to pull before
+        recently. Uses exponential backoff.
+
+        Args:
+            room_id: The room that the events belong to
+            event_ids: A list of events to filter down
+
+        Returns:
+            List of event_ids that can be attempted to be pulled
+        """
+        return await self.db_pool.runInteraction(
+            "filter_events_with_pull_attempt_backoff",
+            self._filter_events_with_pull_attempt_backoff_txn,
+            room_id,
+            event_ids,
+        )
+
+    def _filter_events_with_pull_attempt_backoff_txn(
+        self,
+        txn: LoggingTransaction,
+        room_id: str,
+        event_ids: Sequence[str],
+    ) -> None:
+        where_event_ids_match_clause, values = make_in_list_sql_clause(
+            txn.database_engine, "event_id", event_ids
+        )
+
+        sql = """
+            SELECT event_id FROM event_failed_pull_attempts
+            WHERE
+                room_id = ?
+                %s /* where_event_ids_match_clause */
+                /**
+                 * Exponential back-off (up to the upper bound) so we don't try to
+                 * pull the same event over and over. ex. 2hr, 4hr, 8hr, 16hr, etc.
+                 *
+                 * We use `1 << n` as a power of 2 equivalent for compatibility
+                 * with older SQLites. The left shift equivalent only works with
+                 * powers of 2 because left shift is a binary operation (base-2).
+                 * Otherwise, we would use `power(2, n)` or the power operator, `2^n`.
+                 */
+                AND (
+                    event_id IS NULL
+                    OR ? /* current_time */ >= last_attempt_ts + /*least*/%s((1 << num_attempts) * ? /* step */, ? /* upper bound */)
+                )
+        """
+
+        if isinstance(self.database_engine, PostgresEngine):
+            least_function = "least"
+        elif isinstance(self.database_engine, Sqlite3Engine):
+            least_function = "min"
+        else:
+            raise RuntimeError("Unknown database engine")
+
+        txn.execute(
+            sql % (where_event_ids_match_clause, least_function),
+            (
+                room_id,
+                *values,
+                self._clock.time_msec(),
+                1000 * PULLED_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS,
+                1000 * PULLED_EVENT_BACKOFF_UPPER_BOUND_SECONDS,
+            ),
+        )
+
     async def get_missing_events(
         self,
         room_id: str,
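As a quick, standalone sanity check of the engine switch above (this uses Python's built-in `sqlite3` directly, not Synapse's database layer): SQLite has no `least()`, but its two-argument scalar `min()` together with the `<<` operator evaluates the same backoff expression.

```python
import sqlite3

conn = sqlite3.connect(":memory:")

step_ms = 60 * 60 * 1000                   # 1 hour, in milliseconds
upper_bound_ms = 7 * 24 * 60 * 60 * 1000   # 7 days, in milliseconds

for num_attempts in (1, 2, 3, 10):
    (backoff_ms,) = conn.execute(
        # Same shape as the query above, with SQLite's `min` in place of `least`.
        "SELECT min((1 << ?) * ?, ?)",
        (num_attempts, step_ms, upper_bound_ms),
    ).fetchone()
    print(num_attempts, backoff_ms)
# Prints 2h, 4h, 8h in ms, then the 7-day cap once the doubled step exceeds it.
```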
