Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Faster room joins: Try other destinations when resyncing the state of a partial-state room #12812

Merged
Merged
1 change: 1 addition & 0 deletions changelog.d/12812.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Try other homeservers when re-syncing state for rooms with partial state.
2 changes: 1 addition & 1 deletion synapse/federation/federation_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ async def get_room_state_ids(
if not isinstance(state_event_ids, list) or not isinstance(
auth_event_ids, list
):
raise Exception("invalid response from /state_ids")
raise InvalidResponseError("invalid response from /state_ids")
richvdh marked this conversation as resolved.
Show resolved Hide resolved

return state_event_ids, auth_event_ids

Expand Down
70 changes: 62 additions & 8 deletions synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,16 @@
import logging
from enum import Enum
from http import HTTPStatus
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
from typing import (
TYPE_CHECKING,
Collection,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
)

import attr
from signedjson.key import decode_verify_key_bytes
Expand All @@ -34,6 +43,7 @@
CodeMessageException,
Codes,
FederationDeniedError,
FederationError,
HttpResponseException,
NotFoundError,
RequestSendFailed,
Expand Down Expand Up @@ -545,6 +555,7 @@ async def do_invite_join(
desc="sync_partial_state_room",
func=self._sync_partial_state_room,
destination=origin,
destinations=ret.servers_in_room,
room_id=room_id,
)

Expand Down Expand Up @@ -1443,13 +1454,16 @@ async def get_room_complexity(

async def _sync_partial_state_room(
self,
destination: str,
destination: Optional[str],
destinations: Collection[str],
richvdh marked this conversation as resolved.
Show resolved Hide resolved
room_id: str,
) -> None:
"""Background process to resync the state of a partial-state room

Args:
destination: homeserver to pull the state from
destination: the initial homeserver to pull the state from
destinations: other homeservers to try to pull the state from, if
`destination` is unavailable
room_id: room to be resynced
"""

Expand All @@ -1460,9 +1474,19 @@ async def _sync_partial_state_room(
# TODO(faster_joins): what happens if we leave the room during a resync? if we
# really leave, that might mean we have difficulty getting the room state over
# federation.
#
# TODO(faster_joins): try other destinations if the one we have fails

# Make an infinite iterator of destinations to try. Once we find a working
# destination, we'll stick with it until it flakes.
if destination is not None:
# Move `destination` to the front of the list.
destinations = list(destinations)
if destination in destinations:
destinations.remove(destination)
destinations = [destination] + destinations
destination_iter = itertools.cycle(destinations)
richvdh marked this conversation as resolved.
Show resolved Hide resolved

# `destination` is now the current remote homeserver we're pulling from.
destination = next(destination_iter)
logger.info("Syncing state for room %s via %s", room_id, destination)

# we work through the queue in order of increasing stream ordering.
Expand Down Expand Up @@ -1498,6 +1522,36 @@ async def _sync_partial_state_room(
allow_rejected=True,
)
for event in events:
await self._federation_event_handler.update_state_for_partial_state_event(
destination, event
)
for attempt in range(len(destinations)):
richvdh marked this conversation as resolved.
Show resolved Hide resolved
try:
await self._federation_event_handler.update_state_for_partial_state_event(
destination, event
)
break
except FederationError as e:
if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
logger.error(
"Failed to get state for %s at %s from %s because %s, "
"giving up!",
room_id,
event,
destination,
e,
)
raise
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might not be the best thing to do. We might reach here in the event of a temporary network outage. Then the resync would be stuck until the next restart.

Retrying all destinations indefinitely is also not the best thing to do, eg. if we can successfully reach all homeservers and they all claim they don't have the data we want.

Nonetheless, this PR is an improvement and doesn't introduce any failure modes that didn't exist before.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed. This is one of my "big unanswered questions" for faster joins. Let's leave it like this for now.

could you add a # TODO(faster_joins) alluding to this?


# Try the next remote server.
logger.info(
"Failed to get state for %s at %s from %s because %s",
room_id,
event,
destination,
e,
)
destination = next(destination_iter)
logger.info(
"Syncing state for room %s via %s instead",
room_id,
destination,
)
7 changes: 7 additions & 0 deletions synapse/handlers/federation_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,9 @@ async def update_state_for_partial_state_event(
Args:
destination: server to request full state from
event: partial-state event to be de-partial-stated

Raises:
FederationError if we fail to request state from the remote server.
"""
logger.info("Updating state for %s", event.event_id)
with nested_logging_context(suffix=event.event_id):
Expand Down Expand Up @@ -792,6 +795,10 @@ async def _resolve_state_at_missing_prevs(
Returns:
if we already had all the prev events, `None`. Otherwise, returns a list of
the events in the state at `event`.

Raises:
FederationError if we fail to get the state from the remote server after any
missing `prev_event`s.
"""
room_id = event.room_id
event_id = event.event_id
Expand Down