diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py
index a1cfadb9886b..ba97e336a2bb 100644
--- a/deepspeed/runtime/bf16_optimizer.py
+++ b/deepspeed/runtime/bf16_optimizer.py
@@ -59,7 +59,6 @@ def __init__(self,
         self.grad_acc_dtype = grad_acc_dtype
 
         self.immediate_grad_update = bfloat16_config.immediate_grad_update
-        self.check_overflow = bfloat16_config.check_overflow
 
         self.clip_grad = clip_grad
         self.norm_type = norm_type
diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py
index 7312cd20e557..579a779068b0 100755
--- a/deepspeed/runtime/fp16/loss_scaler.py
+++ b/deepspeed/runtime/fp16/loss_scaler.py
@@ -116,18 +116,17 @@ class DynamicLossScaler(LossScalerBase):
     """
 
     def __init__(self,
-                 init_scale=2**32,
-                 scale_factor=2.,
-                 scale_window=1000,
-                 min_scale=1,
-                 delayed_shift=1,
-                 consecutive_hysteresis=False,
+                 init_scale,
+                 scale_window,
+                 min_scale,
+                 delayed_shift,
+                 consecutive_hysteresis,
                  raise_error_at_min_scale=True,
                  dtype=torch.half):
         super(DynamicLossScaler, self).__init__(init_scale)
         self.cur_iter = 0
         self.last_overflow_iter = -1
-        self.scale_factor = scale_factor
+        self.scale_factor = 2.0
         self.scale_window = scale_window
         self.min_scale = min_scale
         self.delayed_shift = delayed_shift
@@ -209,8 +208,7 @@ def update_scale(self, overflow):
 # we still create a scaler for other dtypes (fp32, bf16) which does not perform any scaling.
 def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args):
     if dtype == torch.half and dynamic_scaling:
-        if dynamic_loss_args is None:
-            return DynamicLossScaler(dtype=dtype)
+        assert dynamic_loss_args is not None, f"Dynamic loss scaling parameters must be defined."
         return DynamicLossScaler(dtype=dtype, **dynamic_loss_args)
 
     loss_scale_value = static_loss_scale if dtype == torch.half else 1.0
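
With the defaults stripped from `DynamicLossScaler.__init__`, any caller reaching it through `CreateLossScaler` must now supply a complete `dynamic_loss_args` dict; passing `None` trips the new assertion instead of silently falling back to built-in defaults. Below is a minimal sketch of a call site under these assumptions: the dict keys mirror the constructor's parameter names, and the values simply reuse the old defaults for illustration.

```python
import torch

from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler

# Hypothetical argument dict: every key below is now mandatory because the
# corresponding __init__ parameters no longer have defaults. The values shown
# mirror the removed defaults for illustration only. Note that "scale_factor"
# is no longer a parameter; it is hard-coded to 2.0 inside the constructor.
dynamic_loss_args = {
    "init_scale": 2**32,
    "scale_window": 1000,
    "min_scale": 1,
    "delayed_shift": 1,
    "consecutive_hysteresis": False,
}

loss_scaler = CreateLossScaler(dtype=torch.half,
                               static_loss_scale=1.0,
                               dynamic_scaling=True,
                               dynamic_loss_args=dynamic_loss_args)

# dynamic_loss_args=None would now raise the assertion error above rather
# than returning a scaler built from implicit defaults.
```

In practice DeepSpeed derives these values from the fp16 section of the engine config rather than building the dict by hand; the sketch only shows the shape the constructor now expects.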