Skip to content

Commit

Permalink
Remove the reconnect timer in DeviceLink
Browse files Browse the repository at this point in the history
The connection between a DeviceLink and DeviceChannel is unreliable when
there is a horde of devices trying to connect. A too large percentage of
devices can think they have a DeviceLink but it terminated and didn't
terminate the DeviceChannel. If we terminate faster, this may reduce the
number of devices in this state.
  • Loading branch information
oestrich committed Jan 16, 2024
1 parent 600e1f4 commit ebd21ae
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 38 deletions.
21 changes: 3 additions & 18 deletions lib/nerves_hub/devices/device_link.ex
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ defmodule NervesHub.Devices.DeviceLink do
:device,
:penalty_timer,
:push_cb,
:reconnect_timer,
:reference_id,
:transport_pid,
:transport_ref,
Expand Down Expand Up @@ -169,13 +168,10 @@ defmodule NervesHub.Devices.DeviceLink do

@impl GenServer
def handle_call(:disconnect, _from, state) do
{:reply, :ok, do_disconnect(state)}
{:stop, :normal, :ok, do_disconnect(state)}
end

def handle_call({:connect, push_cb, params, monitor, _ctx}, _from, %{device: device} = state) do
# Cancel any pending reconnect timer before we get too busy doing work
_ = if state.reconnect_timer, do: Process.cancel_timer(state.reconnect_timer)

with {:ok, device} <- update_metadata(device, params),
{:ok, device} <- Devices.device_connected(device) do
state = %{state | device_api_version: Map.get(params, "device_api_version", "1.0.0")}
Expand Down Expand Up @@ -284,7 +280,6 @@ defmodule NervesHub.Devices.DeviceLink do
state
| device: device,
push_cb: push_cb,
reconnect_timer: nil,
update_started?: push_update?
}
|> maybe_start_penalty_timer()
Expand Down Expand Up @@ -360,7 +355,7 @@ defmodule NervesHub.Devices.DeviceLink do
@impl GenServer
def handle_info({:DOWN, transport_ref, :process, _pid, _reason}, state) do
if state.transport_ref == transport_ref do
{:noreply, do_disconnect(state)}
{:stop, :normal, do_disconnect(state)}
else
# TCP sockets have longer timeouts. There is a chance the old socket
# was still around when the new one started which could result in
Expand Down Expand Up @@ -530,10 +525,6 @@ defmodule NervesHub.Devices.DeviceLink do
{:noreply, state}
end

def handle_info(:timeout_reconnect, state) do
{:stop, :normal, state}
end

def handle_info(msg, state) do
# Ignore unhandled messages so that it doesn't crash the link process
# preventing cascading problems.
Expand Down Expand Up @@ -647,12 +638,6 @@ defmodule NervesHub.Devices.DeviceLink do
Registry.unregister(NervesHub.Devices, device.id)
Tracker.offline(device)

# Give the transport 3 seconds to reconnect to handle cases
# of a socket flapping quickly or something which can be costly
# when doing all the DB lookups on connect in quick succession
_ = if state.reconnect_timer, do: Process.cancel_timer(state.reconnect_timer)
ref = Process.send_after(self(), :timeout_reconnect, :timer.seconds(3))

%{state | device: device, reconnect_timer: ref, transport_pid: nil, transport_ref: nil}
%{state | device: device, transport_pid: nil, transport_ref: nil}
end
end
17 changes: 3 additions & 14 deletions test/nerves_hub/devices/device_link_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,7 @@ defmodule NervesHub.DeviceLinkTest do
assert Process.exit(monitor_pid, :kill)
refute Process.alive?(monitor_pid)

# Timer for reconnect is started
assert :sys.get_state(context.link).reconnect_timer
:timer.sleep(50)

# audits the disconnect
assert al =
Expand All @@ -139,7 +138,7 @@ defmodule NervesHub.DeviceLinkTest do
assert [] = Registry.lookup(Devices, context.device.id)
end

test "calling disconnect/1 on a connected link adds audit log, removes presence, and starts reconnect timer",
test "calling disconnect/1 on a connected link adds audit log, removes presence",
%{device: device, link: link} do
push_cb = fn _, _ -> :ok end
assert {:ok, ^link} = DeviceLink.connect(link, push_cb, %{})
Expand All @@ -151,13 +150,10 @@ defmodule NervesHub.DeviceLinkTest do

refute Tracker.online?(device)
assert [] = Registry.lookup(Devices, device.id)

# Timer for reconnect is started
assert :sys.get_state(link).reconnect_timer
end
end

test "disconnect/1 adds audit log, removes presence, and starts reconnect timer", context do
test "disconnect/1 adds audit log, removes presence", context do
%{device: device, link: link} = create_device(context) |> start_device_link()
assert :ok = DeviceLink.disconnect(link)

Expand All @@ -167,9 +163,6 @@ defmodule NervesHub.DeviceLinkTest do

refute Tracker.online?(device)
assert [] = Registry.lookup(Devices, device.id)

# Timer for reconnect is started
assert :sys.get_state(link).reconnect_timer
end

describe "recv/3 events" do
Expand Down Expand Up @@ -487,10 +480,6 @@ defmodule NervesHub.DeviceLinkTest do
end
end

test "reconnect_timer expiration closes the link" do
assert {:stop, :normal, %{}} = DeviceLink.handle_info(:timeout_reconnect, %{})
end

test "ignores unknown messages" do
log =
ExUnit.CaptureLog.capture_log(fn ->
Expand Down
6 changes: 0 additions & 6 deletions test/nerves_hub_web/channels/device_channel_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,5 @@ defmodule NervesHubWeb.DeviceChannelTest do
push_cb = :sys.get_state(socket.assigns.device_link_pid).push_cb
push_cb.("howdy", %{})
assert_push("howdy", %{})

# Socket close starts DeviceLink reconnect timer
link = socket.assigns.device_link_pid
Process.unlink(socket.channel_pid)
close(socket)
assert :sys.get_state(link).reconnect_timer
end
end

0 comments on commit ebd21ae

Please sign in to comment.