From ff14b77725cb763969d4f0e8d764ff1c80ce32b7 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 1 Sep 2023 13:49:20 +0100 Subject: [PATCH 1/5] Don't reset retry timers on error --- .../federation/sender/per_destination_queue.py | 9 ++++++++- synapse/util/retryutils.py | 18 ++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 31c5c2b7dee2..84e51ec4d9a4 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -314,7 +314,14 @@ async def _transaction_transmission_loop(self) -> None: # This will throw if we wouldn't retry. We do this here so we fail # quickly, but we will later check this again in the http client, # hence why we throw the result away. - await get_retry_limiter(self._destination, self._clock, self._store) + await get_retry_limiter( + self._destination, + self._clock, + self._store, + # Sending a transaction should always succeed, if it doesn't + # then something is wrong and we should backoff. + backoff_on_all_error_codes=True, + ) if self._catching_up: # we potentially need to catch-up first diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 27e9fc976c10..9d2065372c42 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -128,6 +128,7 @@ def __init__( backoff_on_failure: bool = True, notifier: Optional["Notifier"] = None, replication_client: Optional["ReplicationCommandHandler"] = None, + backoff_on_all_error_codes: bool = False, ): """Marks the destination as "down" if an exception is thrown in the context, except for CodeMessageException with code < 500. @@ -147,6 +148,9 @@ def __init__( backoff_on_failure: set to False if we should not increase the retry interval on a failure. + + backoff_on_all_error_codes: Whether we should back off on any + error code. """ self.clock = clock self.store = store @@ -156,6 +160,7 @@ def __init__( self.retry_interval = retry_interval self.backoff_on_404 = backoff_on_404 self.backoff_on_failure = backoff_on_failure + self.backoff_on_all_error_codes = backoff_on_all_error_codes self.notifier = notifier self.replication_client = replication_client @@ -179,6 +184,7 @@ def __exit__( exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: + success = exc_type is None valid_err_code = False if exc_type is None: valid_err_code = True @@ -195,7 +201,9 @@ def __exit__( # won't accept our requests for at least a while. # 429 is us being aggressively rate limited, so lets rate limit # ourselves. - if exc_val.code == 404 and self.backoff_on_404: + if self.backoff_on_all_error_codes: + valid_err_code = False + elif exc_val.code == 404 and self.backoff_on_404: valid_err_code = False elif exc_val.code in (401, 429): valid_err_code = False @@ -204,7 +212,7 @@ def __exit__( else: valid_err_code = False - if valid_err_code: + if success: # We connected successfully. if not self.retry_interval: return @@ -215,6 +223,12 @@ def __exit__( self.failure_ts = None retry_last_ts = 0 self.retry_interval = 0 + elif valid_err_code: + # We got a potentially valid error code back. We don't reset the + # timers though, as the other side might actually be down anyway + # (e.g. some deprovisioned servers will always return a 404 or 403, + # and we don't want to keep resetting the retry timers for them). + return elif not self.backoff_on_failure: return else: From 87ac2edbd8c5cb5a5641ffcb696329630895fc22 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 1 Sep 2023 13:56:37 +0100 Subject: [PATCH 2/5] Fix up --- synapse/federation/sender/per_destination_queue.py | 9 +-------- synapse/federation/transport/client.py | 4 +++- synapse/http/matrixfederationclient.py | 3 +++ 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 84e51ec4d9a4..31c5c2b7dee2 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -314,14 +314,7 @@ async def _transaction_transmission_loop(self) -> None: # This will throw if we wouldn't retry. We do this here so we fail # quickly, but we will later check this again in the http client, # hence why we throw the result away. - await get_retry_limiter( - self._destination, - self._clock, - self._store, - # Sending a transaction should always succeed, if it doesn't - # then something is wrong and we should backoff. - backoff_on_all_error_codes=True, - ) + await get_retry_limiter(self._destination, self._clock, self._store) if self._catching_up: # we potentially need to catch-up first diff --git a/synapse/federation/transport/client.py b/synapse/federation/transport/client.py index 5ce3f345cbeb..b5e4b2680e14 100644 --- a/synapse/federation/transport/client.py +++ b/synapse/federation/transport/client.py @@ -249,8 +249,10 @@ async def send_transaction( data=json_data, json_data_callback=json_data_callback, long_retries=True, - backoff_on_404=True, # If we get a 404 the other side has gone try_trailing_slash_on_400=True, + # Sending a transaction should always succeed, if it doesn't + # then something is wrong and we should backoff. + backoff_on_all_error_codes=True, ) async def make_query( diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 11342ccac8a3..be6733fa8849 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -512,6 +512,7 @@ async def _send_request( long_retries: bool = False, ignore_backoff: bool = False, backoff_on_404: bool = False, + backoff_on_all_error_codes: bool = False, ) -> IResponse: """ Sends a request to the given server. @@ -552,6 +553,7 @@ async def _send_request( and try the request anyway. backoff_on_404: Back off if we get a 404 + backoff_on_all_error_codes: Back off if we get any error response Returns: Resolves with the HTTP response object on success. @@ -594,6 +596,7 @@ async def _send_request( ignore_backoff=ignore_backoff, notifier=self.hs.get_notifier(), replication_client=self.hs.get_replication_command_handler(), + backoff_on_all_error_codes=backoff_on_all_error_codes, ) method_bytes = request.method.encode("ascii") From 99d6e1c789e716e0963cd41642be964274ce2fca Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 1 Sep 2023 13:59:20 +0100 Subject: [PATCH 3/5] Fix up --- synapse/http/matrixfederationclient.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index be6733fa8849..08c7fc1631d8 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -892,6 +892,7 @@ async def put_json( backoff_on_404: bool = False, try_trailing_slash_on_400: bool = False, parser: Literal[None] = None, + backoff_on_all_error_codes: bool = False, ) -> JsonDict: ... @@ -909,6 +910,7 @@ async def put_json( backoff_on_404: bool = False, try_trailing_slash_on_400: bool = False, parser: Optional[ByteParser[T]] = None, + backoff_on_all_error_codes: bool = False, ) -> T: ... @@ -925,6 +927,7 @@ async def put_json( backoff_on_404: bool = False, try_trailing_slash_on_400: bool = False, parser: Optional[ByteParser[T]] = None, + backoff_on_all_error_codes: bool = False, ) -> Union[JsonDict, T]: """Sends the specified json data using PUT @@ -960,6 +963,7 @@ async def put_json( enabled. parser: The parser to use to decode the response. Defaults to parsing as JSON. + backoff_on_all_error_codes: Back off if we get any error response Returns: Succeeds when we get a 2xx HTTP response. The @@ -993,6 +997,7 @@ async def put_json( ignore_backoff=ignore_backoff, long_retries=long_retries, timeout=timeout, + backoff_on_all_error_codes=backoff_on_all_error_codes, ) if timeout is not None: From badce67d72fc9e302e27e63622218c1b68aa306d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 1 Sep 2023 14:04:14 +0100 Subject: [PATCH 4/5] Newsfile --- changelog.d/16221.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/16221.bugfix diff --git a/changelog.d/16221.bugfix b/changelog.d/16221.bugfix new file mode 100644 index 000000000000..22678256e4f7 --- /dev/null +++ b/changelog.d/16221.bugfix @@ -0,0 +1 @@ +Fix long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx series error codes. From 0b68944050562f5cf480a14ca43c7f5d904ff1e1 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 1 Sep 2023 14:17:26 +0100 Subject: [PATCH 5/5] Fix tests --- tests/handlers/test_typing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/handlers/test_typing.py b/tests/handlers/test_typing.py index 2a295da3a0b7..43c513b15722 100644 --- a/tests/handlers/test_typing.py +++ b/tests/handlers/test_typing.py @@ -251,8 +251,8 @@ def test_started_typing_remote_send(self) -> None: ), json_data_callback=ANY, long_retries=True, - backoff_on_404=True, try_trailing_slash_on_400=True, + backoff_on_all_error_codes=True, ) def test_started_typing_remote_recv(self) -> None: @@ -366,7 +366,7 @@ def test_stopped_typing(self) -> None: ), json_data_callback=ANY, long_retries=True, - backoff_on_404=True, + backoff_on_all_error_codes=True, try_trailing_slash_on_400=True, )