Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit c1ef579

Browse files
authored
Add prometheus metrics to track federation delays (#8430)
Add a pair of federation metrics to track the delays in sending PDUs to, and receiving PDUs from, particular servers.
1 parent 7941372 commit c1ef579

File tree

8 files changed

+88
-6
lines changed

8 files changed

+88
-6
lines changed

changelog.d/8430.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add prometheus metrics to track federation delays.

docs/sample_config.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,7 @@ acme:
629629
#tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]
630630

631631

632+
## Federation ##
632633

633634
# Restrict federation to the following whitelist of domains.
634635
# N.B. we recommend also firewalling your federation listener to limit
@@ -662,6 +663,17 @@ federation_ip_range_blacklist:
662663
- 'fe80::/64'
663664
- 'fc00::/7'
664665

666+
# Report prometheus metrics on the age of PDUs being sent to and received from
667+
# the following domains. This can be used to give an idea of "delay" on inbound
668+
# and outbound federation, though be aware that any delay can be due to problems
669+
# at either end or with the intermediate network.
670+
#
671+
# By default, no domains are monitored in this way.
672+
#
673+
#federation_metrics_domains:
674+
# - matrix.org
675+
# - example.com
676+
665677

666678
## Caching ##
667679

synapse/config/_util.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,17 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
from typing import Any, List
15+
from typing import Any, Iterable
1616

1717
import jsonschema
1818

1919
from synapse.config._base import ConfigError
2020
from synapse.types import JsonDict
2121

2222

23-
def validate_config(json_schema: JsonDict, config: Any, config_path: List[str]) -> None:
23+
def validate_config(
24+
json_schema: JsonDict, config: Any, config_path: Iterable[str]
25+
) -> None:
2426
"""Validates a config setting against a JsonSchema definition
2527
2628
This can be used to validate a section of the config file against a schema

synapse/config/federation.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
from netaddr import IPSet
1919

20-
from ._base import Config, ConfigError
20+
from synapse.config._base import Config, ConfigError
21+
from synapse.config._util import validate_config
2122

2223

2324
class FederationConfig(Config):
@@ -52,8 +53,18 @@ def read_config(self, config, **kwargs):
5253
"Invalid range(s) provided in federation_ip_range_blacklist: %s" % e
5354
)
5455

56+
federation_metrics_domains = config.get("federation_metrics_domains") or []
57+
validate_config(
58+
_METRICS_FOR_DOMAINS_SCHEMA,
59+
federation_metrics_domains,
60+
("federation_metrics_domains",),
61+
)
62+
self.federation_metrics_domains = set(federation_metrics_domains)
63+
5564
def generate_config_section(self, config_dir_path, server_name, **kwargs):
5665
return """\
66+
## Federation ##
67+
5768
# Restrict federation to the following whitelist of domains.
5869
# N.B. we recommend also firewalling your federation listener to limit
5970
# inbound federation traffic as early as possible, rather than relying
@@ -85,4 +96,18 @@ def generate_config_section(self, config_dir_path, server_name, **kwargs):
8596
- '::1/128'
8697
- 'fe80::/64'
8798
- 'fc00::/7'
99+
100+
# Report prometheus metrics on the age of PDUs being sent to and received from
101+
# the following domains. This can be used to give an idea of "delay" on inbound
102+
# and outbound federation, though be aware that any delay can be due to problems
103+
# at either end or with the intermediate network.
104+
#
105+
# By default, no domains are monitored in this way.
106+
#
107+
#federation_metrics_domains:
108+
# - matrix.org
109+
# - example.com
88110
"""
111+
112+
113+
# JSON schema used to validate the `federation_metrics_domains` config
# setting: it must be a list of strings.
_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}

synapse/config/homeserver.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,5 +92,4 @@ class HomeServerConfig(RootConfig):
9292
TracerConfig,
9393
WorkerConfig,
9494
RedisConfig,
95-
FederationConfig,
9695
]

synapse/config/tls.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,6 @@ def generate_config_section(
471471
# or by checking matrix.org/federationtester/api/report?server_name=$host
472472
#
473473
#tls_fingerprints: [{"sha256": "<base64_encoded_sha256_fingerprint>"}]
474-
475474
"""
476475
# Lowercase the string representation of boolean values
477476
% {

synapse/federation/federation_server.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
Union,
2929
)
3030

31-
from prometheus_client import Counter, Histogram
31+
from prometheus_client import Counter, Gauge, Histogram
3232

3333
from twisted.internet import defer
3434
from twisted.internet.abstract import isIPAddress
@@ -88,6 +88,13 @@
8888
)
8989

9090

91+
last_pdu_age_metric = Gauge(
92+
"synapse_federation_last_received_pdu_age",
93+
"The age (in seconds) of the last PDU successfully received from the given domain",
94+
labelnames=("server_name",),
95+
)
96+
97+
9198
class FederationServer(FederationBase):
9299
def __init__(self, hs):
93100
super().__init__(hs)
@@ -118,6 +125,10 @@ def __init__(self, hs):
118125
hs, "state_ids_resp", timeout_ms=30000
119126
)
120127

128+
self._federation_metrics_domains = (
129+
hs.get_config().federation.federation_metrics_domains
130+
)
131+
121132
async def on_backfill_request(
122133
self, origin: str, room_id: str, versions: List[str], limit: int
123134
) -> Tuple[int, Dict[str, Any]]:
@@ -262,7 +273,11 @@ async def _handle_pdus_in_txn(
262273

263274
pdus_by_room = {} # type: Dict[str, List[EventBase]]
264275

276+
newest_pdu_ts = 0
277+
265278
for p in transaction.pdus: # type: ignore
279+
# FIXME (richardv): I don't think the "age" handling below works:
280+
# https://github.com/matrix-org/synapse/issues/8429
266281
if "unsigned" in p:
267282
unsigned = p["unsigned"]
268283
if "age" in unsigned:
@@ -300,6 +315,9 @@ async def _handle_pdus_in_txn(
300315
event = event_from_pdu_json(p, room_version)
301316
pdus_by_room.setdefault(room_id, []).append(event)
302317

318+
if event.origin_server_ts > newest_pdu_ts:
319+
newest_pdu_ts = event.origin_server_ts
320+
303321
pdu_results = {}
304322

305323
# we can process different rooms in parallel (which is useful if they
@@ -340,6 +358,10 @@ async def process_pdus_for_room(room_id: str):
340358
process_pdus_for_room, pdus_by_room.keys(), TRANSACTION_CONCURRENCY_LIMIT
341359
)
342360

361+
if newest_pdu_ts and origin in self._federation_metrics_domains:
362+
newest_pdu_age = self._clock.time_msec() - newest_pdu_ts
363+
last_pdu_age_metric.labels(server_name=origin).set(newest_pdu_age / 1000)
364+
343365
return pdu_results
344366

345367
async def _handle_edus_in_txn(self, origin: str, transaction: Transaction):

synapse/federation/sender/transaction_manager.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import logging
1616
from typing import TYPE_CHECKING, List
1717

18+
from prometheus_client import Gauge
19+
1820
from synapse.api.errors import HttpResponseException
1921
from synapse.events import EventBase
2022
from synapse.federation.persistence import TransactionActions
@@ -34,6 +36,12 @@
3436

3537
logger = logging.getLogger(__name__)
3638

39+
last_pdu_age_metric = Gauge(
40+
"synapse_federation_last_sent_pdu_age",
41+
"The age (in seconds) of the last PDU successfully sent to the given domain",
42+
labelnames=("server_name",),
43+
)
44+
3745

3846
class TransactionManager:
3947
"""Helper class which handles building and sending transactions
@@ -48,6 +56,10 @@ def __init__(self, hs: "synapse.server.HomeServer"):
4856
self._transaction_actions = TransactionActions(self._store)
4957
self._transport_layer = hs.get_federation_transport_client()
5058

59+
self._federation_metrics_domains = (
60+
hs.get_config().federation.federation_metrics_domains
61+
)
62+
5163
# HACK to get unique tx id
5264
self._next_txn_id = int(self.clock.time_msec())
5365

@@ -119,6 +131,9 @@ async def send_new_transaction(
119131

120132
# FIXME (erikj): This is a bit of a hack to make the Pdu age
121133
# keys work
134+
# FIXME (richardv): I also believe it no longer works. We (now?) store
135+
# "age_ts" in "unsigned" rather than at the top level. See
136+
# https://github.com/matrix-org/synapse/issues/8429.
122137
def json_data_cb():
123138
data = transaction.get_dict()
124139
now = int(self.clock.time_msec())
@@ -167,5 +182,12 @@ def json_data_cb():
167182
)
168183
success = False
169184

185+
if success and pdus and destination in self._federation_metrics_domains:
186+
last_pdu = pdus[-1]
187+
last_pdu_age = self.clock.time_msec() - last_pdu.origin_server_ts
188+
last_pdu_age_metric.labels(server_name=destination).set(
189+
last_pdu_age / 1000
190+
)
191+
170192
set_tag(tags.ERROR, not success)
171193
return success

0 commit comments

Comments
 (0)