Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Allow for the configuration of max request retries and min/max retry delays in the matrix federation client #15783

Merged
merged 13 commits into from
Jun 21, 2023
1 change: 1 addition & 0 deletions changelog.d/15783.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow for the configuration of max request retries and min/max retry delays in the matrix federation client.
26 changes: 26 additions & 0 deletions docs/usage/configuration/config_documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -1196,6 +1196,32 @@ Example configuration:
allow_device_name_lookup_over_federation: true
```
---
### `federation`

The federation section defines some sub-options related to federation.

The following options configure the timeout and retry logic for a single request,
independently of any other request.
The short retry algorithm is used when something or someone is waiting for the
response to the request, while the long retry algorithm is used for requests that
happen in the background, such as sending a federation transaction.

* `client_timeout`: timeout for the federation requests. Defaults to 60s.
* `max_short_retry_delay`: maximum delay to be used for the short retry algorithm. Defaults to 2s.
* `max_long_retry_delay`: maximum delay to be used for the long retry algorithm. Defaults to 60s.
* `max_short_retries`: maximum number of retries for the short retry algorithm. Defaults to 3 attempts.
* `max_long_retries`: maximum number of retries for the long retry algorithm. Defaults to 10 attempts.

Example configuration:
```yaml
federation:
  client_timeout: 180s
  max_short_retry_delay: 7s
  max_long_retry_delay: 100s
  max_short_retries: 5
  max_long_retries: 20
```
---
## Caching

Options related to caching.
Expand Down
16 changes: 16 additions & 0 deletions synapse/config/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class FederationConfig(Config):
section = "federation"

def read_config(self, config: JsonDict, **kwargs: Any) -> None:
federation_config = config.setdefault("federation", {})

# FIXME: federation_domain_whitelist needs sytests
self.federation_domain_whitelist: Optional[dict] = None
federation_domain_whitelist = config.get("federation_domain_whitelist", None)
Expand Down Expand Up @@ -49,5 +51,19 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None:
"allow_device_name_lookup_over_federation", False
)

# Allow for the configuration of timeout, max request retries
# and min/max retry delays in the matrix federation client.
self.client_timeout_ms = Config.parse_duration(
federation_config.get("client_timeout", "60s")
)
self.max_long_retry_delay_ms = Config.parse_duration(
federation_config.get("max_long_retry_delay", "60s")
)
self.max_short_retry_delay_ms = Config.parse_duration(
federation_config.get("max_short_retry_delay", "2s")
)
self.max_long_retries = federation_config.get("max_long_retries", 10)
self.max_short_retries = federation_config.get("max_short_retries", 3)


_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
59 changes: 38 additions & 21 deletions synapse/http/matrixfederationclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,6 @@
)


MAX_LONG_RETRIES = 10
MAX_SHORT_RETRIES = 3
MAXINT = sys.maxsize


Expand Down Expand Up @@ -406,7 +404,16 @@ def __init__(
self.clock = hs.get_clock()
self._store = hs.get_datastores().main
self.version_string_bytes = hs.version_string.encode("ascii")
self.default_timeout = 60
self.default_timeout_seconds = hs.config.federation.client_timeout_ms / 1000

self.max_long_retry_delay_seconds = (
hs.config.federation.max_long_retry_delay_ms / 1000
)
self.max_short_retry_delay_seconds = (
hs.config.federation.max_short_retry_delay_ms / 1000
)
self.max_long_retries = hs.config.federation.max_long_retries
self.max_short_retries = hs.config.federation.max_short_retries

self._cooperator = Cooperator(scheduler=_make_scheduler(self.reactor))

Expand Down Expand Up @@ -535,10 +542,10 @@ async def _send_request(
logger.exception(f"Invalid destination: {request.destination}.")
raise FederationDeniedError(request.destination)

if timeout:
if timeout is not None:
_sec_timeout = timeout / 1000
else:
_sec_timeout = self.default_timeout
_sec_timeout = self.default_timeout_seconds

if (
self.hs.config.federation.federation_domain_whitelist is not None
Expand Down Expand Up @@ -583,9 +590,9 @@ async def _send_request(
# XXX: Would be much nicer to retry only at the transaction-layer
# (once we have reliable transactions in place)
if long_retries:
retries_left = MAX_LONG_RETRIES
retries_left = self.max_long_retries
else:
retries_left = MAX_SHORT_RETRIES
retries_left = self.max_short_retries

url_bytes = request.uri
url_str = url_bytes.decode("ascii")
Expand Down Expand Up @@ -730,24 +737,34 @@ async def _send_request(

if retries_left and not timeout:
if long_retries:
delay = 4 ** (MAX_LONG_RETRIES + 1 - retries_left)
delay = min(delay, 60)
delay *= random.uniform(0.8, 1.4)
delay_seconds = 4 ** (
self.max_long_retries + 1 - retries_left
)
delay_seconds = min(
delay_seconds, self.max_long_retry_delay_seconds
)
delay_seconds *= random.uniform(0.8, 1.4)
else:
delay = 0.5 * 2 ** (MAX_SHORT_RETRIES - retries_left)
delay = min(delay, 2)
delay *= random.uniform(0.8, 1.4)
delay_seconds = 0.5 * 2 ** (
self.max_short_retries - retries_left
)
delay_seconds = min(
delay_seconds, self.max_short_retry_delay_seconds
)
delay_seconds *= random.uniform(0.8, 1.4)

logger.debug(
"{%s} [%s] Waiting %ss before re-sending...",
request.txn_id,
request.destination,
delay,
delay_seconds,
)

# Sleep for the calculated delay, or wake up immediately
# if we get notified that the server is back up.
await self._sleeper.sleep(request.destination, delay * 1000)
await self._sleeper.sleep(
request.destination, delay_seconds * 1000
)
retries_left -= 1
else:
raise
Expand Down Expand Up @@ -946,7 +963,7 @@ async def put_json(
if timeout is not None:
_sec_timeout = timeout / 1000
else:
_sec_timeout = self.default_timeout
_sec_timeout = self.default_timeout_seconds

if parser is None:
parser = cast(ByteParser[T], JsonParser())
Expand Down Expand Up @@ -1024,10 +1041,10 @@ async def post_json(
ignore_backoff=ignore_backoff,
)

if timeout:
if timeout is not None:
_sec_timeout = timeout / 1000
else:
_sec_timeout = self.default_timeout
_sec_timeout = self.default_timeout_seconds

body = await _handle_response(
self.reactor, _sec_timeout, request, response, start_ms, parser=JsonParser()
Expand Down Expand Up @@ -1135,7 +1152,7 @@ async def get_json(
if timeout is not None:
_sec_timeout = timeout / 1000
else:
_sec_timeout = self.default_timeout
_sec_timeout = self.default_timeout_seconds

if parser is None:
parser = cast(ByteParser[T], JsonParser())
Expand Down Expand Up @@ -1211,7 +1228,7 @@ async def delete_json(
if timeout is not None:
_sec_timeout = timeout / 1000
else:
_sec_timeout = self.default_timeout
_sec_timeout = self.default_timeout_seconds

body = await _handle_response(
self.reactor, _sec_timeout, request, response, start_ms, parser=JsonParser()
Expand Down Expand Up @@ -1263,7 +1280,7 @@ async def get_file(

try:
d = read_body_with_max_size(response, output_stream, max_size)
d.addTimeout(self.default_timeout, self.reactor)
d.addTimeout(self.default_timeout_seconds, self.reactor)
length = await make_deferred_yieldable(d)
except BodyExceededMaxSize:
msg = "Requested file is too large > %r bytes" % (max_size,)
Expand Down
20 changes: 19 additions & 1 deletion tests/http/test_matrixfederationclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from synapse.util import Clock

from tests.server import FakeTransport
from tests.unittest import HomeserverTestCase
from tests.unittest import HomeserverTestCase, override_config


def check_logcontext(context: LoggingContextOrSentinel) -> None:
Expand Down Expand Up @@ -640,3 +640,21 @@ def test_build_auth_headers_rejects_falsey_destinations(self) -> None:
self.cl.build_auth_headers(
b"", b"GET", b"https://example.com", destination_is=b""
)

@override_config(
    {
        "federation": {
            "client_timeout": "180s",
            "max_long_retry_delay": "100s",
            "max_short_retry_delay": "7s",
            "max_long_retries": 20,
            "max_short_retries": 5,
        }
    }
)
def test_configurable_retry_and_delay_values(self) -> None:
    """Check that `federation` config overrides are picked up by the client.

    Durations are supplied as strings with a unit suffix (e.g. "180s") and
    are compared against the client's `*_seconds` attributes; retry counts
    are compared as plain integers.
    """
    # (client attribute, expected value), checked in order so the first
    # mismatch fails the test.
    expected_settings = (
        ("default_timeout_seconds", 180),
        ("max_long_retry_delay_seconds", 100),
        ("max_short_retry_delay_seconds", 7),
        ("max_long_retries", 20),
        ("max_short_retries", 5),
    )
    for attribute_name, expected_value in expected_settings:
        self.assertEqual(getattr(self.cl, attribute_name), expected_value)