Do not drop BatchedSend payload if worker reconnects #5457

Closed
65 changes: 52 additions & 13 deletions distributed/batched.py
@@ -1,12 +1,14 @@
import logging
from collections import deque
from uuid import uuid4

from tornado import gen, locks
from tornado.ioloop import IOLoop

import dask
from dask.utils import parse_timedelta

from .comm import Comm
from .core import CommClosedError

logger = logging.getLogger(__name__)
@@ -43,6 +45,7 @@ def __init__(self, interval, loop=None, serializers=None):
self.interval = parse_timedelta(interval, default="ms")
self.waker = locks.Event()
self.stopped = locks.Event()
self.stopped.set()
self.please_stop = False
self.buffer = []
self.comm = None
@@ -56,12 +59,33 @@ def __init__(self, interval, loop=None, serializers=None):
self.serializers = serializers
self._consecutive_failures = 0

def start(self, comm):
self.comm = comm
self.loop.add_callback(self._background_send)
def start(self, comm: Comm):
"""
Start the BatchedSend by providing an open Comm object.

Calling this again on an already started BatchedSend will raise a
`RuntimeError` if the provided Comm is different from the current one. If
the provided Comm is identical, this is a no-op.

In case the BatchedSend was already closed, this will use the newly
provided Comm to submit any accumulated messages in the buffer.
"""
if self.closed():
if comm.closed():
raise RuntimeError("Comm already closed.")
Collaborator

Suggested change:
- raise RuntimeError("Comm already closed.")
+ raise RuntimeError(f"Tried to start BatchedSend with an already-closed comm: {comm!r}.")

self.comm = comm
self.please_stop = False
Collaborator

Suggested change:
- self.please_stop = False
+ self.please_stop = False
+ self.stopped.clear()

Although I also find the use of stopped a little odd; see the comment on cancel.

self.loop.add_callback(self._background_send)
Collaborator
If self.closed is True, can we be certain this callback is not already running? Two _background_sends running at the same time might make a mess. I think there's currently a race condition:

  • _background_send is awaiting self.waker.wait
  • comm gets closed (nothing happens to BatchedSend state at this point; a closed comm is only detected once a write to it fails)
  • start is called with a new comm, launching a new _background_send coroutine
  • both coroutines are now blocking on self.waker.wait; it's a race as to which gets awakened first
  • both can run validly; from their perspective, nothing even happened (the comm was switched out under their noses while they were sleeping)

I'm not actually sure if, with the exact way the code is written now, two coroutines running at once can actually do something bad, but it still seems like a bad and brittle situation. "only one _background_send is running at once" feels like a key invariant of this class to me.

This is another way in which having an asyncio Task handle on the _background_send might make things easier to reason about #5457 (comment)
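A rough sketch of that Task-handle idea (hypothetical; the class and attribute names here are illustrative and not part of this PR):

import asyncio


class BatchedSendSketch:
    """Illustrative only: hold the background coroutine as a Task so
    start() can tell whether one is already running."""

    def __init__(self):
        self.comm = None
        self._background_task = None  # asyncio.Task or None

    def start(self, comm):
        if self._background_task is not None and not self._background_task.done():
            # A sender coroutine is still running; only a no-op restart
            # with the same comm is allowed.
            if comm is not self.comm:
                raise RuntimeError("BatchedSend already started.")
            return
        self.comm = comm
        # Invariant: at most one _background_send task exists at a time.
        self._background_task = asyncio.ensure_future(self._background_send())

    async def _background_send(self):
        ...  # send loop elided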

elif self.comm is not comm:
raise RuntimeError("BatchedSend already started.")

def closed(self):
Collaborator

please_stop and stopped should probably factor into this as well.

Collaborator

I still think this check is insufficient for telling whether we're closed or not. While we're in the process of closing (please_stop is True, self.stopped is set, _background_send is no longer running, etc.), it may still return False.

From the way this is used, though, in both start and write we probably want to treat "in the process of closing" as closed, not as running. Restarting a BatchedSend that's closing should be an error. If writing to a closed BatchedSend is an error, then so should be writing to one that's in the process of closing.
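A minimal sketch of that stricter check (hypothetical; it reuses the attributes the class already has):

    def closed(self):
        """True if the BatchedSend hasn't been started, is closing, or is closed."""
        return (
            self.comm is None
            or self.comm.closed()
            or self.please_stop       # close() has been requested
            or self.stopped.is_set()  # the background coroutine has exited
        )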

return self.comm and self.comm.closed()
"""True if the BatchedSend hasn't been started or has been closed
already."""
if self.comm is None or self.comm.closed():
return True
else:
return False

def __repr__(self):
if self.closed():
@@ -98,6 +122,8 @@ def _background_send(self):
else:
self.recent_message_log.append("large-message")
self.byte_count += nbytes

payload.clear() # lose ref
except CommClosedError:
logger.info("Batched Comm Closed %r", self.comm, exc_info=True)
break
@@ -111,28 +137,35 @@ def _background_send(self):
logger.exception("Error in batched write")
break
finally:
payload = None # lose ref
# If anything failed we should not lose the payload. If a new comm
# is provided we can still resubmit the messages.
self.buffer = payload + self.buffer
Collaborator

Are you sure we should retain payload? Couldn't those messages have been successfully sent, and the error occurred after? Then we might be duplicating them when we reconnect.

Though maybe we'd rather duplicate messages than drop them. In that case, let's add a note saying so.

Member Author

At this point I am assuming a certain behaviour of the comm: either the comm writes all or nothing. That's likely not always true, but I believe we cannot do much about it at this level of abstraction. IMHO, that guarantee should be implemented by our protocol and/or the Comm interface.

Either way, I'm happy if you have any suggestions to improve this.

Collaborator

After reading through the TCP comm code, I don't think there's anywhere for an exception to happen after all of the payload has been sent. An exception could still happen when part has been sent and part hasn't, but either way, since we have to assume something here, I think it's more useful to assume that the payload hasn't been sent.

@jcrist and I were discussing this, and given the way the BatchedSend interface works, it actually needs to implement some sort of protocol with an ack from the receiver for each sent batch to guarantee messages can't be dropped. Since send on a BatchedSend is nonblocking, the caller is basically handing off full responsibility for the message to BatchedSend. If the message fails to send, it's too late to raise an error and let the caller figure out what to do about it; once we've been given a message, we have to ensure it's delivered. So the logical thing to do would be for the receiving side to ack each message, and only when ack'd does the sender drop the payload (with some deduplication, of course).

OTOH there are lots of protocols out there for doing things like this, more performantly, robustly, and with better testing than we'll ever have. As with other things (framing in serialization), maybe the better solution is to stop duplicating functionality at the application level that should be the transport layer's job.

Could we get rid of BatchedSend entirely with some well-tuned TCP buffering settings + a serialization scheme that was more efficient for many small messages?
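For illustration, a sketch of such an ack-based buffer (a hypothetical protocol; none of these names exist in distributed): the sender tags each batch with a sequence number and keeps it until the receiver acknowledges it, so unacked batches can be replayed after a reconnect instead of being dropped.

from collections import deque


class AckedBufferSketch:
    """Illustrative only: retain sent batches until they are acknowledged."""

    def __init__(self):
        self._seq = 0
        self._unacked = deque()  # (seq, batch) pairs awaiting an ack

    async def send_batch(self, comm, batch):
        # Tag the batch with a sequence number and keep it until acked.
        self._seq += 1
        self._unacked.append((self._seq, batch))
        await comm.write({"seq": self._seq, "msgs": batch})

    def handle_ack(self, acked_seq):
        # The receiver has confirmed everything up to acked_seq; drop it.
        while self._unacked and self._unacked[0][0] <= acked_seq:
            self._unacked.popleft()

    def pending_messages(self):
        # Messages to replay (oldest first) after a reconnect. Duplicates
        # are possible, but nothing is silently dropped.
        return [msg for _, batch in self._unacked for msg in batch]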

else:
# nobreak. We've been gracefully closed.
self.stopped.set()
return

self.stopped.set()
# If we've reached here, it means `break` was hit above and
# there was an exception when using `comm`.
# We can't close gracefully via `.close()` since we can't send messages.
# So we just abort.
# This means that any messages in our buffer are lost.
# To propagate exceptions, we rely on subsequent `BatchedSend.send`
# calls to raise CommClosedErrors.
self.stopped.set()
self.abort()

if self.comm:
self.comm.abort()
yield self.close()
Collaborator

Why close and not abort?

  1. It's weird that calling close from user code will trigger the coroutine to call back into close
  2. Most of the things close does are already guaranteed to have happened by this point:
    • self.please_stop must be True to reach this line of code
    • self.stopped was just set a couple lines above
    • The comm was just aborted (not sure how that compares to await comm.close() though)

Not that abort is any better. The only thing it would do that hasn't already been done by this point is set self.comm = None.

I actually think we should move all shut-down/clean-up logic into the coroutine here, remove abort entirely, and have close() just be a function that tells the coroutine to please_stop and waits until it does. There's a lot of inconsistency from having shutdown logic in three different places.
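A sketch of what close() could look like under that suggestion (hypothetical; it assumes the coroutine itself performs the final flush, closes the comm, and sets self.stopped before returning):

    async def close(self, timeout=None):
        """Ask the background coroutine to stop and wait for it to finish."""
        if self.closed():
            return
        self.please_stop = True
        self.next_deadline = None
        self.waker.set()
        # All flushing and comm teardown happens inside _background_send;
        # close() only signals and waits.
        await self.stopped.wait(timeout=timeout)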


def send(self, *msgs):
"""Schedule a message for sending to the other side

This completes quickly and synchronously
This completes quickly and synchronously.

If the BatchedSend or Comm is already closed, this raises a
CommClosedError and does not accept any further messages to the buffer.
"""
if self.comm is not None and self.comm.closed():
if self.closed():
raise CommClosedError(f"Comm {self.comm!r} already closed.")

self.message_count += len(msgs)
Expand All @@ -143,7 +176,7 @@ def send(self, *msgs):

@gen.coroutine
def close(self, timeout=None):
"""Flush existing messages and then close comm
"""Flush existing messages and then close Comm

If set, raises `tornado.util.TimeoutError` after a timeout.
"""
@@ -153,21 +186,27 @@ def close(self, timeout=None):
self.waker.set()
yield self.stopped.wait(timeout=timeout)
if not self.comm.closed():
payload = []
try:
if self.buffer:
self.buffer, payload = [], self.buffer
yield self.comm.write(
payload, serializers=self.serializers, on_error="raise"
)
except CommClosedError:
pass
# If we're closing and there is an error, there is little we
# can do to recover.
logger.error("Lost %i payload messages.", len(payload))
yield self.comm.close()
Comment on lines 188 to 200
Collaborator

I find it odd that we try to write within close. AFAICT, this code path can only be triggered when write has been called after close, and the BatchedSend is in the "trying to close" state (please_stop is True). That's the only way self.buffer could be non-empty and stopped could be set, because otherwise the coroutine would have gone through another loop, sent the buffer, and cleared it out.

Basically, I'd prefer it if

self.next_deadline = None
self.waker.set()
await self.stopped.wait()

could guarantee that the buffer is flushed. The coroutine's whole job is to flush out the buffer when it's awakened and past its deadline; why duplicate that logic elsewhere?

I believe that tightening up the logic for closed() (to have "trying to close" count as closed) would make this possible. It would prevent write from enqueuing any messages after we've started to close, so we can guarantee that once close has started, no new messages can arrive in the buffer, and there's nothing to flush out.
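Under that tightened definition of closed(), send would reject messages as soon as closing begins; a sketch of that behaviour (not what this diff currently does):

    def send(self, *msgs):
        """Schedule messages for sending, refusing once closing has begun."""
        # With "closing" counted as closed, no message can enter the buffer
        # after close() starts, so close() never has anything left to flush.
        if self.closed() or self.please_stop:
            raise CommClosedError(f"Comm {self.comm!r} already closed.")
        self.message_count += len(msgs)
        self.buffer.extend(msgs)
        if self.next_deadline is None:
            self.waker.set()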


def abort(self):
Collaborator

Should abort be part of the public interface? When/how should callers use abort vs close? What can and cannot be done with a BatchedSend after abort (or close) has been called? These are all things I'd like to see documented.

"""Close the BatchedSend immediately, without waiting for any pending
operations to complete. Buffered data will be lost."""
if self.comm is None:
return
self.please_stop = True
self.buffer = []
self.please_stop = True
self.waker.set()
if not self.comm.closed():
self.comm.abort()
Collaborator

Comment for both here and close(): why should the BatchedSend close/abort the comm when it's closed? Does the caller expect that, by calling start, it has handed off lifecycle responsibility for the comm to the BatchedSend? If so, that should be documented ("once you call start, you must not directly use the comm object").

self.comm = None
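If handing off comm ownership is the intent, the start() docstring could spell it out; a possible wording (purely a suggestion, not part of this PR):

    def start(self, comm: Comm):
        """Start the BatchedSend by providing an open Comm object.

        Once start() has been called, the BatchedSend owns the comm's
        lifecycle: callers must not read from, write to, close, or abort
        the comm directly. Use BatchedSend.close() or BatchedSend.abort()
        instead; both will tear down the underlying comm as needed.
        """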
14 changes: 13 additions & 1 deletion distributed/comm/tcp.py
@@ -1,3 +1,5 @@
import asyncio

import ctypes
import errno
import functools
@@ -290,7 +292,15 @@ async def write(self, msg, serializers=None, on_error="message"):
stream._total_write_index += each_frame_nbytes

# start writing frames
stream.write(b"")
await stream.write(b"")
Collaborator

I'm confused, why were we not awaiting this before?!

I traversed through the blames and it looks like https://github.com/dask/distributed/pull/661/files#diff-581957c552b88dd04319efca1429c0c0827daff509a115542e70b661c4c04914R235 from 5 years ago is where we stopped awaiting this future. There was even some conversation about it: #661 (comment), xref #653.

Seems like there were some issues around concurrent writes to the same stream from multiple coroutines producing orphaned futures. I wonder if that's still the case?

# FIXME: How do I test this? Why is the stream closed _sometimes_?
# Diving into tornado, so far, I can only confirm that once the
# write future has been awaited, the entire buffer has been written
# to the socket. Not sure if one loop iteration is sufficient in
# general or just sufficient for the local tests I've been running
await asyncio.sleep(0)
if stream.closed():
raise StreamClosedError()
Comment on lines +295 to +303
Member Author

Fixing this problem, reusing the buffer, etc. is relatively easy to do assuming we can rely on Comm.write to raise if something is wrong. Tornado ensures that if we await BaseIOStream.write, the data is written to the socket. I dove into the code and can confirm this. To the best of my knowledge, that's working properly.
However, there is a time window where the data is written to the socket, the connection is closed, and Comm.write returns control, but the data was never submitted. I can catch these situations in my local unit tests by waiting a loop iteration, but I'm not sure what the mechanism behind this is and whether or not it is reliable.

cc @jcrist @gjoseph92

FWIW, I think even without this asyncio.sleep(0) this fix should address >99% of our problems. For a full fix we might need to reconsider the possibility for workers to reconnect statefully or introduce another way to guarantee message delivery.

Collaborator

@fjetter to rephrase what you're saying:

  • Awaiting the stream.write is a big improvement, because it means that comm.write doesn't return until the message has been successfully written to the socket buffer.
  • That's still not enough to guarantee the message has actually been sent, because the socket could get closed after the message is handed off to the buffer, but before it's actually sent over the network.

> For a full fix we might need to reconsider the possibility for workers to reconnect statefully or introduce another way to guarantee message delivery.

I agree. Like I mentioned in another comment, I think we need a better protocol (either application-level or more careful use of TCP) to guarantee all messages are delivered through reconnects.

Member Author

That's what I figured, but I was confused why I never encountered this for ordinary comms.

except StreamClosedError as e:
self.stream = None
self._closed = True
@@ -333,6 +343,8 @@ def abort(self):
stream.close()

def closed(self):
if self.stream and self.stream.closed():
self.abort()
Collaborator

Seems odd to me that checking closed on a comm could have a side effect?

Member Author

Well, technically this sets self.stream = None and sets _closed = True. I wouldn't consider these side effects.

return self._closed
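A side-effect-free variant would simply report the state without mutating it (a sketch of the alternative being discussed; it leaves the self.stream cleanup to abort/close):

    def closed(self):
        # Report the closed state without aborting; cleanup stays in abort().
        return self._closed or (self.stream is not None and self.stream.closed())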

@property
2 changes: 1 addition & 1 deletion distributed/scheduler.py
@@ -5510,7 +5510,7 @@ async def handle_worker(self, comm=None, worker=None):
await self.handle_stream(comm=comm, extra={"worker": worker})
finally:
if worker in self.stream_comms:
worker_comm.abort()
await worker_comm.close()
await self.remove_worker(address=worker)

def add_plugin(
150 changes: 148 additions & 2 deletions distributed/tests/test_batched.py
@@ -43,9 +43,11 @@ async def test_BatchedSend():
comm = await connect(e.address)

b = BatchedSend(interval=10)
assert "<BatchedSend: closed>" == str(b)
assert "<BatchedSend: closed>" == repr(b)
b.start(comm)
assert str(len(b.buffer)) in str(b)
assert str(len(b.buffer)) in repr(b)
b.start(comm)

await asyncio.sleep(0.020)

@@ -79,6 +81,125 @@ async def test_send_before_start():
assert result == ("hello", "world")


@pytest.mark.asyncio
async def test_closed_if_not_started():
async with EchoServer() as e:
comm = await connect(e.address)
b = BatchedSend(interval=10)
assert b.closed()
b.start(comm)
assert not b.closed()
await b.close()
assert b.closed()


@pytest.mark.asyncio
async def test_start_twice_with_closing():
async with EchoServer() as e:
comm = await connect(e.address)
comm2 = await connect(e.address)

b = BatchedSend(interval=10)
b.start(comm)

# Same comm is fine
b.start(comm)

await b.close()

b.start(comm2)

b.send("hello")
b.send("world")

result = await comm2.read()
assert result == ("hello", "world")


@pytest.mark.asyncio
async def test_start_twice_with_abort():
async with EchoServer() as e:
comm = await connect(e.address)
comm2 = await connect(e.address)

b = BatchedSend(interval=10)
b.start(comm)

# Same comm is fine
b.start(comm)

b.abort()

b.start(comm2)

b.send("hello")
b.send("world")

result = await comm2.read()
assert result == ("hello", "world")


@pytest.mark.asyncio
async def test_start_twice_with_abort_drops_payload():
async with EchoServer() as e:
comm = await connect(e.address)
comm2 = await connect(e.address)

b = BatchedSend(interval=10)
b.start(comm)
b.send("hello")
b.send("world")

# Same comm is fine
b.start(comm)

b.abort()

b.start(comm2)

with pytest.raises(asyncio.TimeoutError):
res = await asyncio.wait_for(comm2.read(), 0.01)
assert not res


@pytest.mark.asyncio
async def test_start_closed_comm():
async with EchoServer() as e:
comm = await connect(e.address)
await comm.close()

b = BatchedSend(interval="10ms")
with pytest.raises(RuntimeError, match="Comm already closed."):
b.start(comm)


@pytest.mark.asyncio
async def test_start_twice_without_closing():
async with EchoServer() as e:
comm = await connect(e.address)
comm2 = await connect(e.address)

b = BatchedSend(interval=10)
b.start(comm)

# Same comm is fine
b.start(comm)

# different comm only allowed if already closed
with pytest.raises(RuntimeError, match="BatchedSend already started"):
b.start(comm2)

b.send("hello")
b.send("world")

result = await comm.read()
assert result == ("hello", "world")

# This comm hasn't been used so there should be no message received
with pytest.raises(asyncio.TimeoutError):
await asyncio.wait_for(comm2.read(), 0.01)


@pytest.mark.asyncio
async def test_send_after_stream_start():
async with EchoServer() as e:
@@ -113,8 +234,11 @@ async def test_send_before_close():
await asyncio.sleep(0.01)
assert time() < start + 5

msg = "123"
with pytest.raises(CommClosedError):
b.send("123")
b.send(msg)

assert msg not in b.buffer


@pytest.mark.asyncio
@@ -249,3 +373,25 @@ async def test_serializers():
assert "function" in value

assert comm.closed()


@pytest.mark.asyncio
async def test_retain_buffer_commclosed():
async with EchoServer() as e:
with captured_logger("distributed.batched") as caplog:
comm = await connect(e.address)

b = BatchedSend(interval="1s", serializers=["msgpack"])
b.start(comm)
b.send("foo")
assert b.buffer
await comm.close()
await asyncio.sleep(1)

assert "Batched Comm Closed" in caplog.getvalue()
assert b.buffer

new_comm = await connect(e.address)
b.start(new_comm)
assert await new_comm.read() == ("foo",)
assert not b.buffer