From ebd21ae87d23ed64aaa43d79ee5c7d48c5fb5ff2 Mon Sep 17 00:00:00 2001 From: Eric Oestrich Date: Fri, 12 Jan 2024 15:47:57 -0500 Subject: [PATCH] Remove the reconnect timer in DeviceLink The connection between a DeviceLink and DeviceChannel is unreliable when there is a horde of devices trying to connect. A too large percentage of devices can think they have a DeviceLink but it terminated and didn't terminate the DeviceChannel. If we terminate faster, this may reduce the number of devices in this state. --- lib/nerves_hub/devices/device_link.ex | 21 +++---------------- test/nerves_hub/devices/device_link_test.exs | 17 +++------------ .../channels/device_channel_test.exs | 6 ------ 3 files changed, 6 insertions(+), 38 deletions(-) diff --git a/lib/nerves_hub/devices/device_link.ex b/lib/nerves_hub/devices/device_link.ex index daebb2298..b135b6841 100644 --- a/lib/nerves_hub/devices/device_link.ex +++ b/lib/nerves_hub/devices/device_link.ex @@ -26,7 +26,6 @@ defmodule NervesHub.Devices.DeviceLink do :device, :penalty_timer, :push_cb, - :reconnect_timer, :reference_id, :transport_pid, :transport_ref, @@ -169,13 +168,10 @@ defmodule NervesHub.Devices.DeviceLink do @impl GenServer def handle_call(:disconnect, _from, state) do - {:reply, :ok, do_disconnect(state)} + {:stop, :normal, :ok, do_disconnect(state)} end def handle_call({:connect, push_cb, params, monitor, _ctx}, _from, %{device: device} = state) do - # Cancel any pending reconnect timer before we get too busy doing work - _ = if state.reconnect_timer, do: Process.cancel_timer(state.reconnect_timer) - with {:ok, device} <- update_metadata(device, params), {:ok, device} <- Devices.device_connected(device) do state = %{state | device_api_version: Map.get(params, "device_api_version", "1.0.0")} @@ -284,7 +280,6 @@ defmodule NervesHub.Devices.DeviceLink do state | device: device, push_cb: push_cb, - reconnect_timer: nil, update_started?: push_update? } |> maybe_start_penalty_timer() @@ -360,7 +355,7 @@ defmodule NervesHub.Devices.DeviceLink do @impl GenServer def handle_info({:DOWN, transport_ref, :process, _pid, _reason}, state) do if state.transport_ref == transport_ref do - {:noreply, do_disconnect(state)} + {:stop, :normal, do_disconnect(state)} else # TCP sockets have longer timeouts. There is a chance the old socket # was still around when the new one started which could result in @@ -530,10 +525,6 @@ defmodule NervesHub.Devices.DeviceLink do {:noreply, state} end - def handle_info(:timeout_reconnect, state) do - {:stop, :normal, state} - end - def handle_info(msg, state) do # Ignore unhandled messages so that it doesn't crash the link process # preventing cascading problems. @@ -647,12 +638,6 @@ defmodule NervesHub.Devices.DeviceLink do Registry.unregister(NervesHub.Devices, device.id) Tracker.offline(device) - # Give the transport 3 seconds to reconnect to handle cases - # of a socket flapping quickly or something which can be costly - # when doing all the DB lookups on connect in quick succession - _ = if state.reconnect_timer, do: Process.cancel_timer(state.reconnect_timer) - ref = Process.send_after(self(), :timeout_reconnect, :timer.seconds(3)) - - %{state | device: device, reconnect_timer: ref, transport_pid: nil, transport_ref: nil} + %{state | device: device, transport_pid: nil, transport_ref: nil} end end diff --git a/test/nerves_hub/devices/device_link_test.exs b/test/nerves_hub/devices/device_link_test.exs index 5b85a60fd..72a7af16f 100644 --- a/test/nerves_hub/devices/device_link_test.exs +++ b/test/nerves_hub/devices/device_link_test.exs @@ -124,8 +124,7 @@ defmodule NervesHub.DeviceLinkTest do assert Process.exit(monitor_pid, :kill) refute Process.alive?(monitor_pid) - # Timer for reconnect is started - assert :sys.get_state(context.link).reconnect_timer + :timer.sleep(50) # audits the disconnect assert al = @@ -139,7 +138,7 @@ defmodule NervesHub.DeviceLinkTest do assert [] = Registry.lookup(Devices, context.device.id) end - test "calling disconnect/1 on a connected link adds audit log, removes presence, and starts reconnect timer", + test "calling disconnect/1 on a connected link adds audit log, removes presence", %{device: device, link: link} do push_cb = fn _, _ -> :ok end assert {:ok, ^link} = DeviceLink.connect(link, push_cb, %{}) @@ -151,13 +150,10 @@ defmodule NervesHub.DeviceLinkTest do refute Tracker.online?(device) assert [] = Registry.lookup(Devices, device.id) - - # Timer for reconnect is started - assert :sys.get_state(link).reconnect_timer end end - test "disconnect/1 adds audit log, removes presence, and starts reconnect timer", context do + test "disconnect/1 adds audit log, removes presence", context do %{device: device, link: link} = create_device(context) |> start_device_link() assert :ok = DeviceLink.disconnect(link) @@ -167,9 +163,6 @@ defmodule NervesHub.DeviceLinkTest do refute Tracker.online?(device) assert [] = Registry.lookup(Devices, device.id) - - # Timer for reconnect is started - assert :sys.get_state(link).reconnect_timer end describe "recv/3 events" do @@ -487,10 +480,6 @@ defmodule NervesHub.DeviceLinkTest do end end - test "reconnect_timer expiration closes the link" do - assert {:stop, :normal, %{}} = DeviceLink.handle_info(:timeout_reconnect, %{}) - end - test "ignores unknown messages" do log = ExUnit.CaptureLog.capture_log(fn -> diff --git a/test/nerves_hub_web/channels/device_channel_test.exs b/test/nerves_hub_web/channels/device_channel_test.exs index 2c1f79a5b..03ae22c91 100644 --- a/test/nerves_hub_web/channels/device_channel_test.exs +++ b/test/nerves_hub_web/channels/device_channel_test.exs @@ -32,11 +32,5 @@ defmodule NervesHubWeb.DeviceChannelTest do push_cb = :sys.get_state(socket.assigns.device_link_pid).push_cb push_cb.("howdy", %{}) assert_push("howdy", %{}) - - # Socket close starts DeviceLink reconnect timer - link = socket.assigns.device_link_pid - Process.unlink(socket.channel_pid) - close(socket) - assert :sys.get_state(link).reconnect_timer end end