From f587d338bc4a05a6094207e46cddb6d4006ec8b0 Mon Sep 17 00:00:00 2001 From: Fantix King Date: Fri, 27 Aug 2021 10:20:39 -0400 Subject: [PATCH] Fix for AWS RDS multi-az failover (#2866) In case of RDS failover (based on the Reboot With Failover test), TCP keepalive is needed to detect unhealthy connections to the failing master, and new connect attempts may raise TimeoutError before the new master is in position. This also fixes the TCP keepalive feature on macOS, which requires Python 3.10 (refs bpo-34932). Refs #2293. --- edb/server/pgcon/pgcon.pyx | 18 ++++++++++++++++++ edb/server/server.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/edb/server/pgcon/pgcon.pyx b/edb/server/pgcon/pgcon.pyx index 202c3b0e1db..c1f162f69c6 100644 --- a/edb/server/pgcon/pgcon.pyx +++ b/edb/server/pgcon/pgcon.pyx @@ -130,13 +130,31 @@ def _set_tcp_keepalive(transport): # against AWS RDS. We are keeping the TCP keepalive for generic # Postgres connections as the kernel overhead is considered low, and # in certain cases it does save us some reconnection time. + # + # In case of high-availability Postgres, TCP keepalive is necessary to + # disconnect from a failing master node, if no other failover information + # is available. sock = transport.get_extra_info('socket') sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + + # TCP_KEEPIDLE: the time (in seconds) the connection needs to remain idle + # before TCP starts sending keepalive probes. This is socket.TCP_KEEPIDLE + # on Linux, and socket.TCP_KEEPALIVE on macOS from Python 3.10. if hasattr(socket, 'TCP_KEEPIDLE'): sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, TCP_KEEPIDLE) + if hasattr(socket, 'TCP_KEEPALIVE'): + sock.setsockopt(socket.IPPROTO_TCP, + socket.TCP_KEEPALIVE, TCP_KEEPIDLE) + + # TCP_KEEPINTVL: The time (in seconds) between individual keepalive probes. 
+ if hasattr(socket, 'TCP_KEEPINTVL'): sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, TCP_KEEPINTVL) + + # TCP_KEEPCNT: The maximum number of keepalive probes TCP should send + # before dropping the connection. + if hasattr(socket, 'TCP_KEEPCNT'): sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, TCP_KEEPCNT) diff --git a/edb/server/server.py b/edb/server/server.py index 47d0bcb6fe8..a60dc224c72 100644 --- a/edb/server/server.py +++ b/edb/server/server.py @@ -904,7 +904,7 @@ async def _reconnect_sys_pgcon(self): try: conn = await self._pg_connect(defines.EDGEDB_SYSTEM_DB) break - except ConnectionError: + except (ConnectionError, TimeoutError): # Keep retrying as far as: # 1. The EdgeDB server is still serving, # 2. We still cannot connect to the Postgres cluster, or