diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 25444361719b0..dba2b793c0de0 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -1,3 +1,3 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows" +deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 25444361719b0..dba2b793c0de0 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -1,3 +1,3 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows" +deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 6f5e888bad7f1..569f9fcafe81e 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -567,19 +567,21 @@ def __init__(self, lr: float = 0.01, num_blocks: int = 5): self.lr = lr self.num_blocks = num_blocks self.prepare_data_per_node = True - - metric = Accuracy(task="multiclass", num_classes=3) if _TM_GE_0_11 else Accuracy() - self.train_acc = metric.clone() - self.valid_acc = metric.clone() - self.test_acc = metric.clone() - + self.train_acc = self.valid_acc = self.test_acc = None self.model = None def make_block(self): return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU()) def configure_sharded_model(self) -> None: + # As of deepspeed v0.9.3, in ZeRO stage 3 all submodules need to be created within this hook, + # including the metrics. Otherwise, modules that aren't affected by `deepspeed.zero.Init()` + # won't be moved to the GPU. See https://github.com/microsoft/DeepSpeed/pull/3611 if self.model is None: + metric = Accuracy(task="multiclass", num_classes=3) if _TM_GE_0_11 else Accuracy() + self.train_acc = metric.clone() + self.valid_acc = metric.clone() + self.test_acc = metric.clone() self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3)) def forward(self, x):