#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from typing import Optional, Union
import torch
from pytext.config import ConfigBase
from pytext.config.component import Component, ComponentType, create_scheduler
from pytext.optimizer import Optimizer
from torch.optim.lr_scheduler import (
CosineAnnealingLR as TorchCosineAnnealingLR,
CyclicLR as TorchCyclicLR,
ExponentialLR as TorchExponentialLR,
ReduceLROnPlateau as TorchReduceLROnPlateau,
StepLR as TorchStepLR,
_LRScheduler,
)
class Scheduler(Component):
    """
    Base wrapper for learning rate schedulers.

    A Scheduler adjusts the learning rate during training and may wrap either
    a scheduler from the torch library or a custom implementation. Two kinds
    of scheduling are supported:

    * per epoch: the learning rate is adjusted at the end of each epoch;
    * per batch: the learning rate is adjusted after the forward and backward
      pass through one batch.

    Subclasses implement `step_epoch()`, which is called at the end of each
    epoch, and/or `step_batch()`, which is called at the end of each batch of
    the training data. `prepare()` can be used by batch schedulers to
    initialize any attributes they may need.
    """

    __COMPONENT_TYPE__ = ComponentType.SCHEDULER
    __EXPANSIBLE__ = True

    class Config(ConfigBase):
        pass

    def step_batch(self, **kwargs) -> None:
        """Per-batch hook; the base implementation does nothing."""

    def step_epoch(self, **kwargs) -> None:
        """Per-epoch hook; the base implementation does nothing."""

    def prepare(self, train_iter, total_epochs):
        """Initialization hook; the base implementation does nothing."""
class BatchScheduler(Scheduler):
    """Scheduler base that records the extent of training in `prepare()`."""

    def prepare(self, train_iter, total_epochs):
        """
        Store the number of epochs and the number of batches per epoch.

        `steps_per_epoch` is taken from `train_iter.total_num_batches` and is
        None when the iterator has no such attribute.
        """
        batches_per_epoch = getattr(train_iter, "total_num_batches", None)
        self.num_epochs = total_epochs
        self.steps_per_epoch = batches_per_epoch
class LmFineTuning(_LRScheduler, BatchScheduler):
    """
    Fine-tuning schedule from the paper
    "[arXiv:1801.06146]Universal Language Model Fine-tuning for Text Classification".

    Combines three techniques: slanted triangular learning rates,
    discriminative fine-tuning (per-layer learning rates), and gradual
    unfreezing of the pretrained layers.
    """

    class Config(Scheduler.Config):
        #: The fraction of iterations during which the learning rate is
        #: increased. Default 0.1
        cut_frac: float = 0.1
        #: How many times smaller the lowest LR is than the maximum LR eta_max.
        ratio: int = 32
        #: Number of param_groups, starting from the
        #: end, that were not pretrained. The default value is 2, since the base Model
        #: class supplies to the optimizer typically one param_group from the embedding
        #: and one param_group from its other components.
        non_pretrained_param_groups: int = 2
        #: Factor to multiply lr for all pretrained layers by.
        lm_lr_multiplier: float = 1.0
        #: Whether to make each pretrained layer's lr
        #: one-half as large as the next (higher) layer.
        lm_use_per_layer_lr: bool = False
        #: Whether to unfreeze layers one by one (per epoch).
        lm_gradual_unfreezing: bool = True
        #: Though the name is `last_epoch`, it means `last batch update`.
        #: last_batch_update = current_epoch_number * num_batches_per_epoch + batch_id
        #: It is incremented by 1 after each batch update.
        last_epoch: int = -1

    def __init__(
        self,
        optimizer,
        cut_frac=0.1,
        ratio=32,
        non_pretrained_param_groups=2,
        lm_lr_multiplier=1.0,
        lm_use_per_layer_lr=False,
        lm_gradual_unfreezing=True,
        last_epoch=-1,
    ):
        """
        Store the schedule hyper-parameters and initialize the base scheduler.

        The optimizer must be a `torch.optim.Adam`, and at least one of its
        param groups must be counted as non-pretrained.
        """
        assert isinstance(optimizer, torch.optim.Adam)
        # Both stay None until prepare() fills them in; get_lr() returns a
        # placeholder while they are unset.
        self.num_epochs = None
        self.steps_per_epoch = None
        self.cut_frac = cut_frac
        self.ratio = ratio
        # Param groups are assumed to be ordered pretrained-first; everything
        # before the trailing `non_pretrained_param_groups` is a pretrained layer.
        group_count = len(optimizer.param_groups)
        self.lm_pretrained_layers = group_count - non_pretrained_param_groups
        assert self.lm_pretrained_layers >= 0
        assert non_pretrained_param_groups > 0
        self.lm_lr_multiplier = lm_lr_multiplier
        self.lm_use_per_layer_lr = lm_use_per_layer_lr
        self.lm_gradual_unfreezing = lm_gradual_unfreezing
        super().__init__(optimizer, last_epoch)

    @classmethod
    def from_config(cls, config: Config, optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(
            optimizer,
            cut_frac=config.cut_frac,
            ratio=config.ratio,
            non_pretrained_param_groups=config.non_pretrained_param_groups,
            lm_lr_multiplier=config.lm_lr_multiplier,
            lm_use_per_layer_lr=config.lm_use_per_layer_lr,
            lm_gradual_unfreezing=config.lm_gradual_unfreezing,
            last_epoch=config.last_epoch,
        )

    def get_lr(self):
        """
        Return one learning rate per param group.

        Each base lr is scaled by the slanted-triangular multiplier, the
        per-layer multiplier and the frozen multiplier (0.0 for layers that
        are still frozen). Before `num_epochs` and `steps_per_epoch` are set,
        a list of 1.0 is returned instead.
        """
        prepared = self.num_epochs is not None and self.steps_per_epoch is not None
        if not prepared:
            return [1.0] * len(self.base_lrs)

        slant = self._slanted_multiplier()
        rates = []
        for group_index, base_lr in enumerate(self.base_lrs):
            scale = (
                slant
                * self._lm_layer_multiplier(group_index)
                * self._lm_frozen_multiplier(group_index)
            )
            rates.append(scale * base_lr)
        return rates

    def _slanted_multiplier(self):
        """
        Return the slanted-triangular multiplier for the current batch update.

        The value rises linearly from 1/ratio to 1 over the first `cut_frac`
        of the current phase and then falls linearly back towards 1/ratio.
        With gradual unfreezing, each epoch up to and including the point
        where all pretrained layers are unfrozen is its own phase; the
        remaining steps form one final phase.
        """
        step = self.last_epoch
        total = self.num_epochs * self.steps_per_epoch
        if step > total:
            # Past the planned end of training: stay at the minimum.
            return 1.0 / self.ratio

        if self.lm_gradual_unfreezing:
            unfreeze_steps = self.lm_pretrained_layers * self.steps_per_epoch
            if self.last_epoch > unfreeze_steps:
                # Final phase: measure progress from the end of unfreezing.
                step, total = step - unfreeze_steps, total - unfreeze_steps
            else:
                # Still unfreezing: the triangle restarts every epoch.
                step = step % self.steps_per_epoch
                total = self.steps_per_epoch

        cut = math.floor(self.cut_frac * total)
        progress = (
            step / cut if step < cut else 1.0 - (step - cut) / (total - cut)
        )
        return (1.0 + progress * (self.ratio - 1.0)) / self.ratio

    def _lm_layer_multiplier(self, layer_index):
        """
        Return the discriminative fine-tuning multiplier for a param group.

        Non-pretrained groups get 1.0. Pretrained groups get
        `lm_lr_multiplier`, further halved for each layer of distance from
        the first non-pretrained group when `lm_use_per_layer_lr` is set.
        """
        if layer_index >= self.lm_pretrained_layers:
            return 1.0
        scale = 1.0 * self.lm_lr_multiplier
        if self.lm_use_per_layer_lr:
            scale *= 2 ** (layer_index - self.lm_pretrained_layers)
        return scale

    def _lm_frozen_multiplier(self, layer_index):
        """Return 0.0 while the param group is frozen, otherwise 1.0."""
        if self._lm_frozen(layer_index):
            return 0.0
        return 1.0

    def _lm_frozen(self, layer_index):
        """
        Tell whether a param group is still frozen at the current step.

        Only pretrained groups are ever frozen, and only when gradual
        unfreezing is enabled; group `i` stays frozen while the (fractional)
        epoch count is below `lm_pretrained_layers - i`.
        """
        if not self.lm_gradual_unfreezing or layer_index >= self.lm_pretrained_layers:
            return False
        epochs_elapsed = self.last_epoch / self.steps_per_epoch
        return epochs_elapsed < self.lm_pretrained_layers - layer_index

    def step_batch(self, metrics=None, epoch=None):
        """Advance the schedule by one batch; `metrics` is not used."""
        self.step(epoch)
class StepLR(TorchStepLR, Scheduler):
    """
    Per-epoch wrapper around `torch.optim.lr_scheduler.StepLR`.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: Period of learning rate decay.
        step_size: int = 30
        #: Multiplicative factor of learning rate decay.
        gamma: float = 0.1

    @classmethod
    def from_config(cls, config: Config, optimizer):
        """Build the scheduler for `optimizer` from `config`."""
        return cls(optimizer, step_size=config.step_size, gamma=config.gamma)

    def step_epoch(self, metrics=None, epoch=None):
        """Advance the schedule at the end of an epoch; `metrics` is not used."""
        self.step(epoch)
class ReduceLROnPlateau(TorchReduceLROnPlateau, Scheduler):
    """
    Per-epoch wrapper around `torch.optim.lr_scheduler.ReduceLROnPlateau`.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: This indicates the desirable direction in which we would like the
        #: training to proceed. If set to true, the learning rate will be
        #: reduced when the quantity being monitored stops going down.
        lower_is_better: bool = True
        #: Factor by which the learning rate will be reduced. new_lr = lr * factor
        factor: float = 0.1
        #: Number of epochs with no improvement after which learning rate will
        #: be reduced
        patience: int = 5
        #: Lower bound on the learning rate of all param groups
        min_lr: float = 0
        #: Threshold for measuring the new optimum, to only focus on significant
        #: changes.
        threshold: float = 0.0001
        #: Selects between the `abs` (True) and `rel` (False) threshold modes.
        #: In rel mode, dynamic_threshold = best * ( 1 + threshold ) in max mode
        #: or best * ( 1 - threshold ) in min mode.
        #: In abs mode, dynamic_threshold = best + threshold in max mode or
        #: best - threshold in min mode.
        threshold_is_absolute: bool = True
        #: Number of epochs to wait before resuming normal operation after
        #: lr has been reduced.
        cooldown: int = 0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """
        Build the scheduler for `optimizer`, translating the boolean config
        flags into the string modes expected by the torch scheduler.
        """
        if config.lower_is_better:
            mode = "min"
        else:
            mode = "max"
        if config.threshold_is_absolute:
            threshold_mode = "abs"
        else:
            threshold_mode = "rel"
        return cls(
            optimizer,
            mode=mode,
            factor=config.factor,
            patience=config.patience,
            min_lr=config.min_lr,
            threshold=config.threshold,
            threshold_mode=threshold_mode,
            cooldown=config.cooldown,
        )

    def step_epoch(self, metrics, epoch):
        """Forward the monitored `metrics` and `epoch` to the torch `step`."""
        self.step(metrics, epoch)
class CosineAnnealingLR(TorchCosineAnnealingLR, BatchScheduler):
    """
    Per-batch wrapper around `torch.optim.lr_scheduler.CosineAnnealingLR`.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: Maximum number of iterations.
        t_max: int = 1000
        #: Minimum learning rate
        eta_min: float = 0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from `config`."""
        return cls(optimizer, T_max=config.t_max, eta_min=config.eta_min)

    def step_batch(self, metrics=None, epoch=None):
        """Advance the schedule after a batch; `metrics` is not used."""
        self.step(epoch)
class CyclicLR(TorchCyclicLR, BatchScheduler):
    """
    Per-batch wrapper around `torch.optim.lr_scheduler.CyclicLR`.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        # Each field is forwarded unchanged to the torch CyclicLR constructor
        # argument of the same name.
        base_lr: float = 0.001
        max_lr: float = 0.002
        step_size_up: int = 2000
        step_size_down: Optional[int] = None
        mode: str = "triangular"
        gamma: float = 1.0
        scale_mode: str = "cycle"
        cycle_momentum: bool = True
        base_momentum: float = 0.8
        max_momentum: float = 0.9
        last_epoch: int = -1

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer`, forwarding every config field."""
        forwarded_fields = (
            "base_lr",
            "max_lr",
            "step_size_up",
            "step_size_down",
            "mode",
            "gamma",
            "scale_mode",
            "cycle_momentum",
            "base_momentum",
            "max_momentum",
            "last_epoch",
        )
        options = {field: getattr(config, field) for field in forwarded_fields}
        return cls(optimizer=optimizer, **options)

    def step_batch(self, metrics=None, epoch=None):
        """Advance the schedule after a batch; `metrics` is not used."""
        self.step(epoch)
class ExponentialLR(TorchExponentialLR, Scheduler):
    """
    Per-epoch wrapper around `torch.optim.lr_scheduler.ExponentialLR`.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: Multiplicative factor of learning rate decay.
        gamma: float = 0.1

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from `config`."""
        return cls(optimizer, gamma=config.gamma)

    def step_epoch(self, metrics=None, epoch=None):
        """Advance the schedule at the end of an epoch; `metrics` is not used."""
        self.step(epoch)
class WarmupScheduler(_LRScheduler, BatchScheduler):
    """
    Scheduler that linearly increases the learning rate from 0 to its final
    value over a number of steps:

        lr = base_lr * current_step / warmup_steps

    After the warm-up phase, the scheduler can optionally decay the learning
    rate as the inverse square root of the number of training steps taken:

        lr = base_lr * sqrt(warmup_steps) / sqrt(current_step)
    """

    class Config(BatchScheduler.Config):
        #: number of training steps over which to increase learning rate
        warmup_steps: int = 10000
        #: whether to perform inverse sqrt decay after the warmup phase
        inverse_sqrt_decay: bool = False

    def __init__(self, optimizer, warmup_steps, inverse_sqrt_decay):
        """Store the warm-up settings; `warmup_steps` must be positive."""
        assert warmup_steps > 0
        self.warmup_steps = warmup_steps
        self.inverse_sqrt_decay = inverse_sqrt_decay
        # sqrt(warmup_steps), precomputed for the inverse-sqrt decay formula.
        self.decay_factor = warmup_steps ** 0.5
        self.current_steps = 0
        super().__init__(optimizer)

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from `config`."""
        return cls(optimizer, config.warmup_steps, config.inverse_sqrt_decay)

    def prepare(self, train_iter, total_epochs):
        """Record the training extent, then take one step to initialize the lr."""
        super().prepare(train_iter, total_epochs)
        self.step_batch()  # initialize learning rate

    def step_batch(self):
        """Count one more training step and update the learning rates."""
        self.current_steps += 1
        self.step()

    def get_lr(self):
        """Return the learning rate of every param group for the current step."""
        if self.current_steps < self.warmup_steps:
            # Linear ramp from 0 towards the base learning rate.
            scale = self.current_steps / self.warmup_steps
        elif self.inverse_sqrt_decay:
            scale = self.decay_factor / (self.current_steps ** 0.5)
        else:
            scale = 1.0
        return [scale * base_lr for base_lr in self.base_lrs]
class PolynomialDecayScheduler(_LRScheduler, BatchScheduler):
    """
    Applies a polynomial decay with lr warmup to the learning rate.

    It is commonly observed that a monotonically decreasing learning rate, whose
    degree of change is carefully chosen, results in a better performing model.

    This scheduler linearly increases the learning rate from 0 to its final
    value at the beginning of training, over `warmup_steps` steps. It then
    applies a polynomial decay function to each optimizer step, starting from
    the provided `base_lrs` and reaching `end_learning_rate` after
    `total_steps`. With `warmup_steps == 0` there is no warm-up phase and the
    decay starts immediately.
    """

    class Config(BatchScheduler.Config):
        #: number of training steps over which to increase learning rate
        warmup_steps: int = 0
        #: number of training steps for learning rate decay
        total_steps: int
        #: end learning rate after `total_steps` of training
        end_learning_rate: float
        #: power used for polynomial decay calculation
        power: float = 1.0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from `config`."""
        return cls(
            optimizer,
            config.warmup_steps,
            config.total_steps,
            config.end_learning_rate,
            config.power,
        )

    def __init__(self, optimizer, warmup_steps, total_steps, end_learning_rate, power):
        """
        Store the schedule settings.

        Requires `total_steps > warmup_steps >= 0`, which also guarantees a
        non-zero length for the decay phase.
        """
        assert total_steps > warmup_steps >= 0
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.current_steps = 0
        super().__init__(optimizer)

    def prepare(self, train_iter, total_epochs):
        """Record the training extent, then take one step to initialize the lr."""
        super().prepare(train_iter, total_epochs)
        self.step_batch()  # initialize learning rate

    def get_lr(self):
        """
        Return the learning rate of every param group for the current step.

        Three phases: linear warm-up (only when `warmup_steps > 0`),
        polynomial decay up to `total_steps`, then a constant
        `end_learning_rate`.
        """
        # The `warmup_steps > 0` guard matters: the base scheduler queries
        # get_lr() while current_steps is still 0, and with the default
        # warmup_steps of 0 the division below would raise ZeroDivisionError.
        if self.warmup_steps > 0 and self.current_steps <= self.warmup_steps:
            # during warmup the learning rate linearly increases until
            # it reaches base_lr.
            warmup_factor = self.current_steps / self.warmup_steps
            lrs = [warmup_factor * base_lr for base_lr in self.base_lrs]
        elif self.current_steps <= self.total_steps:
            # polynomial decay of the learning rate until it reaches
            # end_learning_rate
            decay_factor = (
                1
                - (self.current_steps - self.warmup_steps)
                / (self.total_steps - self.warmup_steps)
            ) ** self.power
            lrs = [
                (base_lr - self.end_learning_rate) * decay_factor
                + self.end_learning_rate
                for base_lr in self.base_lrs
            ]
        else:
            # stay at end_learning_rate after total_steps
            lrs = [self.end_learning_rate for _ in self.base_lrs]
        return lrs

    def step_batch(self):
        """Count one more training step and update the learning rates."""
        self.current_steps += 1
        # update optimizer.param_groups's learning rate
        self.step()
class SchedulerWithWarmup(_LRScheduler, BatchScheduler):
    """
    Wraps another scheduler with a warmup phase.

    For the first `warmup_scheduler.warmup_steps` batch steps, all calls are
    delegated to the warmup scheduler; afterwards they are delegated to the
    scheduler specified in `scheduler`.

    `warmup_scheduler`: the configuration for the WarmupScheduler, which warms
    up the learning rate linearly over `warmup_steps`.

    `scheduler`: the main scheduler that is applied after the warmup phase
    (once `warmup_steps` have passed).
    """

    class Config(BatchScheduler.Config):
        # the definition of the warmup scheduler for the warmup phase
        warmup_scheduler: WarmupScheduler.Config = WarmupScheduler.Config()
        # the definition of the main scheduler to apply once the warmup phase
        # has passed
        scheduler: Union[
            ExponentialLR.Config,
            CosineAnnealingLR.Config,
            ReduceLROnPlateau.Config,
            LmFineTuning.Config,
            CyclicLR.Config,
        ]

    def __init__(self, optimizer, warmup_scheduler, scheduler, switch_steps):
        """
        Store the two wrapped schedulers and the step count at which to switch.

        The `_LRScheduler` constructor is not invoked here; only the
        attributes below are set on the instance.
        """
        self.optimizer = optimizer
        self.warmup_scheduler = warmup_scheduler
        self.scheduler = scheduler
        self.switch_steps = switch_steps
        self.curr_steps = 0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Create both wrapped schedulers on `optimizer` and combine them."""
        warmup = create_scheduler(config.warmup_scheduler, optimizer)
        main = create_scheduler(config.scheduler, optimizer)
        switch_steps = config.warmup_scheduler.warmup_steps
        return cls(optimizer, warmup, main, switch_steps)

    def _in_warmup(self):
        """Tell whether the warmup scheduler is still the active one."""
        return self.curr_steps < self.switch_steps

    def prepare(self, train_iter, total_epochs):
        """Prepare this scheduler and then both wrapped schedulers."""
        super().prepare(train_iter, total_epochs)
        for wrapped in (self.warmup_scheduler, self.scheduler):
            wrapped.prepare(train_iter, total_epochs)

    def step_batch(self):
        """
        Step whichever scheduler is active for this batch.

        The internal step counter only advances during the warmup phase.
        """
        if self._in_warmup():
            self.curr_steps += 1
            return self.warmup_scheduler.step_batch()
        return self.scheduler.step_batch()

    def step_epoch(self, metrics, epoch):
        """
        Forward the end-of-epoch call to whichever scheduler is active.

        The main scheduler is always called with `epoch=None`.
        """
        if self._in_warmup():
            return self.warmup_scheduler.step_epoch(metrics=metrics, epoch=epoch)
        return self.scheduler.step_epoch(metrics=metrics, epoch=None)

    def get_lr(self):
        """Return the learning rates reported by the active scheduler."""
        active = self.warmup_scheduler if self._in_warmup() else self.scheduler
        return active.get_lr()