diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt
index 25444361719b0..dba2b793c0de0 100644
--- a/requirements/fabric/strategies.txt
+++ b/requirements/fabric/strategies.txt
@@ -1,3 +1,3 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
-deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows"
+deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows"
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 25444361719b0..dba2b793c0de0 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -1,3 +1,3 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
-deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows"
+deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows"
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 6f5e888bad7f1..569f9fcafe81e 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -567,19 +567,21 @@ def __init__(self, lr: float = 0.01, num_blocks: int = 5):
         self.lr = lr
         self.num_blocks = num_blocks
         self.prepare_data_per_node = True
-
-        metric = Accuracy(task="multiclass", num_classes=3) if _TM_GE_0_11 else Accuracy()
-        self.train_acc = metric.clone()
-        self.valid_acc = metric.clone()
-        self.test_acc = metric.clone()
-
+        self.train_acc = self.valid_acc = self.test_acc = None
         self.model = None
 
     def make_block(self):
         return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU())
 
     def configure_sharded_model(self) -> None:
+        # As of deepspeed v0.9.3, in ZeRO stage 3 all submodules (including the metrics) must be created
+        # within this hook. Otherwise, modules that aren't affected by `deepspeed.zero.Init()`
+        # won't be moved to the GPU. See https://github.com/microsoft/DeepSpeed/pull/3611
         if self.model is None:
+            metric = Accuracy(task="multiclass", num_classes=3) if _TM_GE_0_11 else Accuracy()
+            self.train_acc = metric.clone()
+            self.valid_acc = metric.clone()
+            self.test_acc = metric.clone()
             self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3))
 
     def forward(self, x):