diff --git a/edb/server/pgcon/pgcon.pyx b/edb/server/pgcon/pgcon.pyx index 202c3b0e1db..c1f162f69c6 100644 --- a/edb/server/pgcon/pgcon.pyx +++ b/edb/server/pgcon/pgcon.pyx @@ -130,13 +130,31 @@ def _set_tcp_keepalive(transport): # against AWS RDS. We are keeping the TCP keepalive for generic # Postgres connections as the kernel overhead is considered low, and # in certain cases it does save us some reconnection time. + # + # In case of high-availability Postgres, TCP keepalive is necessary to + # disconnect from a failing master node, if no other failover information + # is available. sock = transport.get_extra_info('socket') sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + + # TCP_KEEPIDLE: the time (in seconds) the connection needs to remain idle + # before TCP starts sending keepalive probes. This is socket.TCP_KEEPIDLE + # on Linux, and socket.TCP_KEEPALIVE on macOS from Python 3.10. if hasattr(socket, 'TCP_KEEPIDLE'): sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, TCP_KEEPIDLE) + if hasattr(socket, 'TCP_KEEPALIVE'): + sock.setsockopt(socket.IPPROTO_TCP, + socket.TCP_KEEPALIVE, TCP_KEEPIDLE) + + # TCP_KEEPINTVL: The time (in seconds) between individual keepalive probes. + if hasattr(socket, 'TCP_KEEPINTVL'): sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, TCP_KEEPINTVL) + + # TCP_KEEPCNT: The maximum number of keepalive probes TCP should send + # before dropping the connection. + if hasattr(socket, 'TCP_KEEPCNT'): sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, TCP_KEEPCNT) diff --git a/edb/server/server.py b/edb/server/server.py index 47d0bcb6fe8..a60dc224c72 100644 --- a/edb/server/server.py +++ b/edb/server/server.py @@ -904,7 +904,7 @@ async def _reconnect_sys_pgcon(self): try: conn = await self._pg_connect(defines.EDGEDB_SYSTEM_DB) break - except ConnectionError: + except (ConnectionError, TimeoutError): # Keep retrying as far as: # 1. The EdgeDB server is still serving, # 2. We still cannot connect to the Postgres cluster, or