From 2b742dd2ccd899f24111b0fba4d3726e65b8c88e Mon Sep 17 00:00:00 2001
From: Phil Wang
Date: Mon, 11 Jul 2022 21:02:06 -0700
Subject: [PATCH] move accelerator backward outside of autocast context, also
 calculate total loss correctly across gradient accumulated steps

---
 README.md                          |  2 +-
 .../denoising_diffusion_pytorch.py | 12 +++++++++---
 setup.py                           |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ec9b9c0c8..6e4aad12a 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ trainer = Trainer(
     diffusion,
     'path/to/your/images',
     train_batch_size = 32,
-    train_lr = 1e-4,
+    train_lr = 8e-5,
     train_num_steps = 700000,         # total training steps
     gradient_accumulate_every = 2,    # gradient accumulation steps
     ema_decay = 0.995,                # exponential moving average decay
diff --git a/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py b/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
index 4af427e21..549e68303 100644
--- a/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+++ b/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
@@ -681,6 +681,7 @@ def __init__(
         train_num_steps = 100000,
         ema_update_every = 10,
         ema_decay = 0.995,
+        adam_betas = (0.9, 0.99),
         save_and_sample_every = 1000,
         num_samples = 25,
         results_folder = './results',
@@ -719,7 +720,7 @@ def __init__(
 
         # optimizer
 
-        self.opt = Adam(diffusion_model.parameters(), lr = train_lr)
+        self.opt = Adam(diffusion_model.parameters(), lr = train_lr, betas = adam_betas)
 
         # for logging results in a folder periodically
 
@@ -772,14 +773,19 @@ def train(self):
 
             while self.step < self.train_num_steps:
 
+                total_loss = 0.
+
                 for _ in range(self.gradient_accumulate_every):
                     data = next(self.dl).to(device)
 
                     with self.accelerator.autocast():
                         loss = self.model(data)
-                        self.accelerator.backward(loss / self.gradient_accumulate_every)
+                        loss = loss / self.gradient_accumulate_every
+                        total_loss += loss.item()
+
+                    self.accelerator.backward(loss)
 
-                pbar.set_description(f'loss: {loss.item():.4f}')
+                pbar.set_description(f'loss: {total_loss:.4f}')
 
                 accelerator.wait_for_everyone()
 
diff --git a/setup.py b/setup.py
index 0f9f2fb5c..00d1754ca 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'denoising-diffusion-pytorch',
   packages = find_packages(),
-  version = '0.25.2',
+  version = '0.25.3',
   license='MIT',
   description = 'Denoising Diffusion Probabilistic Models - Pytorch',
   author = 'Phil Wang',
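
Editor's note: below is a minimal standalone sketch (not part of the commit) of the training-loop pattern this patch moves to, using the Hugging Face Accelerate API the repository builds on. The forward pass stays under accelerator.autocast(), accelerator.backward() is called after the autocast block exits, and the scaled per-substep losses are summed into total_loss so the logged value covers the whole accumulated step. The toy model, data, and hyperparameters are illustrative stand-ins, not code from the repository.

import torch
from torch import nn
from accelerate import Accelerator

accelerator = Accelerator()

model = nn.Linear(16, 1)   # toy stand-in for the diffusion model
opt = torch.optim.Adam(model.parameters(), lr = 8e-5, betas = (0.9, 0.99))
model, opt = accelerator.prepare(model, opt)

gradient_accumulate_every = 2
total_loss = 0.

for _ in range(gradient_accumulate_every):
    data = torch.randn(4, 16, device = accelerator.device)  # dummy batch

    with accelerator.autocast():
        loss = model(data).pow(2).mean()           # forward pass under mixed precision
        loss = loss / gradient_accumulate_every    # scale for gradient accumulation
        total_loss += loss.item()                  # running total for logging

    accelerator.backward(loss)                     # backward outside the autocast block

opt.step()
opt.zero_grad()
print(f'loss: {total_loss:.4f}')                   # loss across the full accumulated step

Summing loss.item() over the sub-steps fixes the logging bug the subject line describes: before this patch the progress bar showed only the last micro-batch's unscaled loss rather than the loss of the full accumulated step.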