add more detailed logging for fp16 diverging
Summary: We often get a generic "minimum loss scale reached" error when fp16 training diverges. It would be useful to have a breakdown of where exactly the gradient norm becomes too big.

Reviewed By: myleott

Differential Revision: D23297774

fbshipit-source-id: 69da1cca1be22f15af633f8efe4e7b491cf4f6f9
Alex Xiao authored and facebook-github-bot committed Aug 29, 2020
1 parent 4bfd70d commit 0989eca
Showing 1 changed file with 14 additions and 0 deletions.
fairseq/nan_detector.py
@@ -19,6 +19,7 @@ def __init__(self, model, forward=True, backward=True):
         self.fhooks = []
         self.forward = forward
         self.backward = backward
+        self.model = model
         self.reset()

         for name, mod in model.named_modules():
@@ -29,6 +30,19 @@ def __enter__(self):
         return self

     def __exit__(self, exc_type, exc_value, exc_traceback):
+        # Dump out all model gnorms to enable better debugging
+        norm = {}
+        gradients = {}
+        for name, param in self.model.named_parameters():
+            grad_norm = torch.norm(param.grad.data, p=2, dtype=torch.float32)
+            norm[name] = grad_norm.item()
+            if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any():
+                gradients[name] = param.grad.data
+        if len(gradients) > 0:
+            logger.info("Detected nan/inf grad norm, dumping norms...")
+            logger.info(f"norms: {norm}")
+            logger.info(f"gradients: {gradients}")
+
         self.close()

     def add_hooks(self, module):
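For context, here is a minimal sketch of how the detector might be wrapped around a training step so the new logging fires when gradients blow up. The model, inputs, and loss below are made-up placeholders, not part of this commit; note also that param.grad can be None for parameters that received no gradient, so in practice a guard around the norm computation may be needed.

import logging
import torch
import torch.nn as nn

from fairseq.nan_detector import NanDetector

logging.basicConfig(level=logging.INFO)

# Hypothetical model and batch, for illustration only.
model = nn.Linear(8, 2)
inputs = torch.randn(4, 8)
targets = torch.randint(0, 2, (4,))

# Wrapping forward/backward in the detector registers hooks that catch
# nan/inf values; on __exit__, the code added in this commit also logs
# every parameter's gradient norm, so you can see which layer diverged
# instead of getting a bare "minimum loss scale reached" error.
with NanDetector(model):
    loss = nn.functional.cross_entropy(model(inputs), targets)
    loss.backward()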
