diff --git a/keras/mixed_precision/loss_scale_optimizer.py b/keras/mixed_precision/loss_scale_optimizer.py
index b1a95abae27..2f0bc20fbcd 100644
--- a/keras/mixed_precision/loss_scale_optimizer.py
+++ b/keras/mixed_precision/loss_scale_optimizer.py
@@ -406,14 +406,14 @@ class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass):
     Args:
       inner_optimizer: The `tf.keras.optimizers.Optimizer` or
         `tf.keras.optimizers.experimental.Optimizer` instance to wrap.
-      dynamic: Bool indicating whether dynamic loss scaling is used. Defaults to
-        True. If True, the loss scale will be dynamically updated over time
-        using an algorithm that keeps the loss scale at approximately its
-        optimal value. If False, a single fixed loss scale is used and
-        `initial_scale` must be specified, which is used as the loss scale.
+      dynamic: Bool indicating whether dynamic loss scaling is used. If True,
+        the loss scale will be dynamically updated over time using an algorithm
+        that keeps the loss scale at approximately its optimal value. If False,
+        a single fixed loss scale is used and `initial_scale` must be
+        specified, which is used as the loss scale.
         Recommended to keep as True, as choosing a fixed loss scale can be
         tricky. Currently, there is a small performance overhead to dynamic loss
-        scaling compared to fixed loss scaling.
+        scaling compared to fixed loss scaling. Defaults to `True`.
       initial_scale: The initial loss scale. If `dynamic` is True, this defaults
         to `2 ** 15`. If `dynamic` is False, this must be specified and acts as
         the sole loss scale, as the loss scale does not change over time. When
@@ -422,11 +422,11 @@ class BaseLossScaleOptimizer(metaclass=LossScaleOptimizerMetaclass):
         quickly than a loss scale that is too low gets raised.
       dynamic_growth_steps: With dynamic loss scaling, every
         `dynamic_growth_steps` steps with finite gradients, the loss scale is
-        doubled. Defaults to 2000. If a nonfinite gradient is encountered, the
+        doubled. If a nonfinite gradient is encountered, the
         count is reset back to zero, gradients are skipped that step, and the
         loss scale is halved. The count can be queried with
         `LossScaleOptimizer.dynamic_counter`. This argument can only be
-        specified if `dynamic` is True.
+        specified if `dynamic` is True. Defaults to `2000`.
 
     `LossScaleOptimizer` will occasionally skip applying gradients to the
     variables, in which case the trainable variables will not change that step.
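
Not part of the diff above, but for context: a minimal usage sketch of the documented behavior, assuming the public `tf.keras.mixed_precision.LossScaleOptimizer` API and using `tf.keras.optimizers.SGD` as the wrapped inner optimizer purely for illustration.

import tensorflow as tf

# Dynamic loss scaling (the recommended default): the scale starts at 2 ** 15
# and is doubled after every `dynamic_growth_steps` consecutive steps with
# finite gradients; on a nonfinite gradient the step is skipped, the counter
# resets, and the scale is halved.
dynamic_opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(0.1), dynamic=True, dynamic_growth_steps=2000
)
print(float(dynamic_opt.loss_scale))     # 32768.0, i.e. the 2 ** 15 default
print(int(dynamic_opt.dynamic_counter))  # 0 before any training steps run

# Fixed loss scaling: `dynamic=False` requires `initial_scale`, which then
# acts as the sole, unchanging loss scale for the whole run.
fixed_opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.SGD(0.1), dynamic=False, initial_scale=1024
)
print(float(fixed_opt.loss_scale))       # 1024.0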