learning-at-home · justheuristic · Apr 26, 2021 · Apr 25, 2021 · Apr 25, 2021 · Apr 25, 2021
diff --git a/hivemind/client/averaging/__init__.py b/hivemind/client/averaging/__init__.py
@@ -257,21 +257,24 @@ async def _step(self, *, future: MPFuture, gather_binary: bytes, weight: float,
                 # averaging is finished, exit the loop
                 future.set_result(allreduce_runner.gathered)
 
-            except (AllreduceException, MatchmakingException, AssertionError,
-                    asyncio.InvalidStateError, grpc.RpcError, grpc.aio.AioRpcError, InternalError) as e:
+            except (AllreduceException, MatchmakingException, AssertionError, StopAsyncIteration, InternalError,
+                    asyncio.CancelledError, asyncio.InvalidStateError, grpc.RpcError, grpc.aio.AioRpcError) as e:
                 time_elapsed = get_dht_time() - start_time
                 if not allow_retries or (timeout is not None and timeout < time_elapsed):
-                    logger.warning(f"Averager caught {e}")
-                    future.set_result(None)
+                    logger.exception(f"Averager caught {repr(e)}")
+                    future.set_exception(e)
                 else:
-                    logger.warning(f"Averager caught {e}, retrying")
+                    logger.warning(f"Averager caught {repr(e)}, retrying")
 
-            except Exception as e:
+            except BaseException as e:
                 future.set_exception(e)
                 raise
             finally:
                 _ = self._running_groups.pop(group_id, None)
                 self._pending_group_assembled.set()
+                if not future.done():
+                    future.set_exception(RuntimeError("Internal sanity check failed: averager.step left future pending."
+                                                      " Please report this to hivemind issues."))
 
     async def _make_allreduce_runner(self, group_info: GroupInfo, min_vector_size: int, **kwargs) -> AllReduceRunner:
         """ Use a group description found by Matchmaking to form AllreduceRunner """

diff --git a/hivemind/optim/collaborative.py b/hivemind/optim/collaborative.py
@@ -83,7 +83,7 @@ def __init__(self, opt: torch.optim.Optimizer, *, dht: DHT, prefix: str, target_
                  batch_size_per_step: Optional[int] = None, scheduler: Optional[LRSchedulerBase] = None,
                  min_refresh_period: float = 0.5, max_refresh_period: float = 30, default_refresh_period: float = 3,
                  expected_drift_peers: float = 3, expected_drift_rate: float = 0.2, performance_ema_alpha: float = 0.1,
-                 metadata_expiration: float = 30.0, averaging_timeout: Optional[float] = None, step_tolerance: int = 1,
+                 metadata_expiration: float = 60.0, averaging_timeout: Optional[float] = None, step_tolerance: int = 1,
                  reuse_grad_buffers: bool = False, accumulate_grads_on: Optional[torch.device] = None,
                  client_mode: bool = False, verbose: bool = False, **kwargs):
         super().__init__(opt, dht)