From 94fb46e3b68b7ead95eac2fd19c615f622487c11 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 01:32:57 -0500 Subject: [PATCH 1/4] Time how long it takes us to do backfill processing The after timing can be influenced by slow `/state_ids` requests when proessing the pulled events. Part of https://github.com/matrix-org/synapse/issues/13356 --- synapse/handlers/federation.py | 26 +++++++++++++++ synapse/handlers/federation_event.py | 48 ++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 5042236742da..db90e626d225 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -32,6 +32,7 @@ ) import attr +from prometheus_client import Histogram from signedjson.key import decode_verify_key_bytes from signedjson.sign import verify_signed_json from unpaddedbase64 import decode_base64 @@ -80,6 +81,24 @@ logger = logging.getLogger(__name__) +backfill_processing_before_timer = Histogram( + "synapse_federation_backfill_processing_before_time_seconds", + "sec", + [], + buckets=( + 1.0, + 5.0, + 10.0, + 20.0, + 30.0, + 40.0, + 60.0, + 80.0, + "+Inf", + ), +) + + def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]: """Get joined domains from state @@ -138,6 +157,7 @@ class FederationHandler: def __init__(self, hs: "HomeServer"): self.hs = hs + self.clock = hs.get_clock() self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._state_storage_controller = self._storage_controllers.state @@ -203,6 +223,7 @@ async def maybe_backfill( async def _maybe_backfill_inner( self, room_id: str, current_depth: int, limit: int ) -> bool: + processing_start_time = self.clock.time_msec() backwards_extremities = [ _BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY) for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room( @@ -425,6 +446,11 @@ async def try_backfill(domains: List[str]) -> bool: return False + processing_end_time = self.clock.time_msec() + backfill_processing_before_timer.observe( + (processing_start_time - processing_end_time) / 1000 + ) + success = await try_backfill(likely_domains) if success: return True diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 8968b705d43a..986f2b61e57d 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -29,7 +29,7 @@ Tuple, ) -from prometheus_client import Counter +from prometheus_client import Counter, Histogram from synapse import event_auth from synapse.api.constants import ( @@ -92,6 +92,25 @@ "Events received over federation that we marked as soft_failed", ) +backfill_processing_after_timer = Histogram( + "synapse_federation_backfill_processing_after_time_seconds", + "sec", + [], + buckets=( + 1.0, + 5.0, + 10.0, + 20.0, + 30.0, + 40.0, + 60.0, + 80.0, + 120.0, + 180.0, + "+Inf", + ), +) + class FederationEventHandler: """Handles events that originated from federation. @@ -597,20 +616,21 @@ async def backfill( if not events: return - # if there are any events in the wrong room, the remote server is buggy and - # should not be trusted. - for ev in events: - if ev.room_id != room_id: - raise InvalidResponseError( - f"Remote server {dest} returned event {ev.event_id} which is in " - f"room {ev.room_id}, when we were backfilling in {room_id}" - ) + with backfill_processing_after_timer.time(): + # if there are any events in the wrong room, the remote server is buggy and + # should not be trusted. + for ev in events: + if ev.room_id != room_id: + raise InvalidResponseError( + f"Remote server {dest} returned event {ev.event_id} which is in " + f"room {ev.room_id}, when we were backfilling in {room_id}" + ) - await self._process_pulled_events( - dest, - events, - backfilled=True, - ) + await self._process_pulled_events( + dest, + events, + backfilled=True, + ) @trace async def _get_missing_events_for_pdu( From f9dc0dc05af84755c2575e2206ff08dcf5aecf76 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 01:37:12 -0500 Subject: [PATCH 2/4] Add changelog --- changelog.d/13535.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/13535.misc diff --git a/changelog.d/13535.misc b/changelog.d/13535.misc new file mode 100644 index 000000000000..6b190181c813 --- /dev/null +++ b/changelog.d/13535.misc @@ -0,0 +1 @@ +Add metrics to time how long it takes us to do backfill processing (`synapse_federation_backfill_processing_before_time_seconds`, `synapse_federation_backfill_processing_after_time_seconds`). From 2c870086fbcf76f37ba0a19a539db6757be50f47 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 02:03:41 -0500 Subject: [PATCH 3/4] Include the linearizer in the timing See https://github.com/matrix-org/synapse/pull/13535#discussion_r946391837 --- synapse/handlers/federation.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index db90e626d225..8c80c182474f 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -217,13 +217,36 @@ async def maybe_backfill( return. This is used as part of the heuristic to decide if we should back paginate. """ + processing_start_time = self.clock.time_msec() async with self._room_backfill.queue(room_id): - return await self._maybe_backfill_inner(room_id, current_depth, limit) + return await self._maybe_backfill_inner( + room_id, + current_depth, + limit, + processing_start_time=processing_start_time, + ) async def _maybe_backfill_inner( - self, room_id: str, current_depth: int, limit: int + self, + room_id: str, + current_depth: int, + limit: int, + *, + processing_start_time: int, ) -> bool: - processing_start_time = self.clock.time_msec() + """ + Checks whether the `current_depth` is at or approaching any backfill + points in the room and if so, will backfill. We only care about + checking backfill points that happened before the `current_depth` + (meaning less than or equal to the `current_depth`). + + Args: + room_id: The room to backfill in. + current_depth: The depth to check at for any upcoming backfill points. + limit: The max number of events to request from the remote federated server. + processing_start_time: The time when `maybe_backfill` started + processing. Only used for timing. + """ backwards_extremities = [ _BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY) for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room( From 15081ea1d8eb7f051e5c6d2340e11afb95e769ef Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 15:23:37 -0500 Subject: [PATCH 4/4] Comment why timers added See https://github.com/matrix-org/synapse/pull/13535#discussion_r946488539 --- synapse/handlers/federation.py | 5 ++++- synapse/handlers/federation_event.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 8c80c182474f..7c7018023356 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -80,7 +80,7 @@ logger = logging.getLogger(__name__) - +# Added to debug performance and track progress on optimizations backfill_processing_before_timer = Histogram( "synapse_federation_backfill_processing_before_time_seconds", "sec", @@ -217,7 +217,10 @@ async def maybe_backfill( return. This is used as part of the heuristic to decide if we should back paginate. """ + # Starting the processing time here so we can include the room backfill + # linearizer lock queue in the timing processing_start_time = self.clock.time_msec() + async with self._room_backfill.queue(room_id): return await self._maybe_backfill_inner( room_id, diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 986f2b61e57d..8eed0fe076ed 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -92,6 +92,7 @@ "Events received over federation that we marked as soft_failed", ) +# Added to debug performance and track progress on optimizations backfill_processing_after_timer = Histogram( "synapse_federation_backfill_processing_after_time_seconds", "sec",