# Source code for pytext.optimizer.scheduler

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from typing import Optional, Union

import torch
from pytext.config import ConfigBase
from pytext.config.component import Component, ComponentType, create_scheduler
from pytext.optimizer import Optimizer
from torch.optim.lr_scheduler import (
    CosineAnnealingLR as TorchCosineAnnealingLR,
    CyclicLR as TorchCyclicLR,
    ExponentialLR as TorchExponentialLR,
    ReduceLROnPlateau as TorchReduceLROnPlateau,
    StepLR as TorchStepLR,
    _LRScheduler,
)


class Scheduler(Component):
    """
    Base wrapper for learning rate schedulers, whether they come from the
    torch library or are custom implementations.

    Two kinds of scheduling are supported:

    * per-epoch scheduling, where the learning rate is adjusted at the end of
      each epoch;
    * per-batch scheduling, where the learning rate is adjusted after the
      forward and backward pass through one batch during training.

    Subclasses implement step_epoch(), called at the end of each epoch, and/or
    step_batch(), called at the end of each batch in the training data. Both
    are no-ops here, so a subclass only overrides the hook it needs.

    prepare() can be used by BatchSchedulers to initialize any attributes they
    may need.
    """

    __COMPONENT_TYPE__ = ComponentType.SCHEDULER
    __EXPANSIBLE__ = True

    class Config(ConfigBase):
        # The base scheduler has no configurable fields.
        pass

    def step_batch(self, **kwargs) -> None:
        """Per-batch hook; does nothing in the base class."""

    def step_epoch(self, **kwargs) -> None:
        """Per-epoch hook; does nothing in the base class."""

    def prepare(self, train_iter, total_epochs):
        """Setup hook given the training iterator and epoch count; no-op here."""


class BatchScheduler(Scheduler):
    """Scheduler base class for schedules driven by per-batch steps."""

    def prepare(self, train_iter, total_epochs):
        """
        Remember the size of the training run.

        Stores `total_epochs` as `self.num_epochs` and the iterator's
        `total_num_batches` attribute as `self.steps_per_epoch`.
        """
        self.num_epochs = total_epochs
        # Iterators without a `total_num_batches` attribute yield None here.
        batches_per_epoch = getattr(train_iter, "total_num_batches", None)
        self.steps_per_epoch = batches_per_epoch


class LmFineTuning(_LRScheduler, BatchScheduler):
    """
    Fine-tuning methods from the paper
    "[arXiv:1801.06146]Universal Language Model Fine-tuning for Text Classification".

    The training schedule combines three techniques: slanted triangular
    learning rates, discriminative fine-tuning (per-layer learning rates), and
    gradual unfreezing. The learning rate of each param group is the product
    of the three corresponding multipliers and the group's base learning rate.
    """

    class Config(Scheduler.Config):
        #: Fraction of the iterations during which the learning rate increases.
        #: Default 0.1
        cut_frac: float = 0.1
        #: How many times smaller the lowest LR is than the maximum LR eta_max.
        ratio: int = 32
        #: Number of param_groups, counted from the end, that were not
        #: pretrained. The default value is 2, since the base Model class
        #: typically supplies the optimizer with one param_group from the
        #: embedding and one param_group from its other components.
        non_pretrained_param_groups: int = 2
        #: Factor by which the lr of all pretrained layers is multiplied.
        lm_lr_multiplier: float = 1.0
        #: Whether to make each pretrained layer's lr
        #:    one-half as large as the next (higher) layer.
        lm_use_per_layer_lr: bool = False
        #: Whether to unfreeze layers one by one (per epoch).
        lm_gradual_unfreezing: bool = True
        #: Despite the name `last_epoch`, this counts the `last batch update`:
        #: last_batch_update = current_epoch_number * num_batches_per_epoch + batch_id
        #: It is incremented by 1 after each batch update.
        last_epoch: int = -1

    def __init__(
        self,
        optimizer,
        cut_frac=0.1,
        ratio=32,
        non_pretrained_param_groups=2,
        lm_lr_multiplier=1.0,
        lm_use_per_layer_lr=False,
        lm_gradual_unfreezing=True,
        last_epoch=-1,
    ):
        """
        Store the schedule parameters (see `Config` for their meaning) and
        initialize the underlying torch scheduler.

        The optimizer must be a `torch.optim.Adam` instance, and at least one
        of its param groups must be counted as non-pretrained.
        """
        assert isinstance(optimizer, torch.optim.Adam)

        # Both stay None until prepare() supplies the run dimensions (the
        # Trainer is expected to trigger that); get_lr() checks for this.
        self.num_epochs = None
        self.steps_per_epoch = None

        self.cut_frac = cut_frac
        self.ratio = ratio

        # Param groups in front of the trailing non-pretrained ones are the
        # pretrained layers.
        total_groups = len(optimizer.param_groups)
        self.lm_pretrained_layers = total_groups - non_pretrained_param_groups
        assert self.lm_pretrained_layers >= 0
        assert non_pretrained_param_groups > 0

        self.lm_lr_multiplier = lm_lr_multiplier
        self.lm_use_per_layer_lr = lm_use_per_layer_lr
        self.lm_gradual_unfreezing = lm_gradual_unfreezing

        # Every attribute read by get_lr() is assigned before this call.
        super().__init__(optimizer, last_epoch)

    @classmethod
    def from_config(cls, config: Config, optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(
            optimizer,
            cut_frac=config.cut_frac,
            ratio=config.ratio,
            non_pretrained_param_groups=config.non_pretrained_param_groups,
            lm_lr_multiplier=config.lm_lr_multiplier,
            lm_use_per_layer_lr=config.lm_use_per_layer_lr,
            lm_gradual_unfreezing=config.lm_gradual_unfreezing,
            last_epoch=config.last_epoch,
        )

    def get_lr(self):
        """
        Return one learning rate per param group.

        While `num_epochs` or `steps_per_epoch` is still None, a placeholder
        of 1.0 per group is returned.
        """
        prepared = self.num_epochs is not None and self.steps_per_epoch is not None
        if not prepared:
            # NOTE(review): the placeholder is a literal 1.0, not the base lr.
            return [1.0] * len(self.base_lrs)

        slant = self._slanted_multiplier()
        rates = []
        for group_index, base_lr in enumerate(self.base_lrs):
            layer_factor = self._lm_layer_multiplier(group_index)
            frozen_factor = self._lm_frozen_multiplier(group_index)
            rates.append(slant * layer_factor * frozen_factor * base_lr)
        return rates

    def _slanted_multiplier(self):
        """
        Slanted triangular multiplier in [1 / ratio, 1] for the current step.

        The multiplier rises linearly during the first `cut_frac` of a phase
        and falls linearly afterwards. With gradual unfreezing, every epoch
        of the unfreezing period is its own phase and the remaining training
        forms one final phase.
        """
        step = self.last_epoch
        total_steps = self.num_epochs * self.steps_per_epoch

        # Beyond the planned number of steps, stay at the minimum.
        if step > total_steps:
            return 1.0 / self.ratio

        if self.lm_gradual_unfreezing:
            unfreeze_steps = self.lm_pretrained_layers * self.steps_per_epoch
            if self.last_epoch > unfreeze_steps:
                # Final phase: measure progress from the end of unfreezing.
                step = step - unfreeze_steps
                total_steps = total_steps - unfreeze_steps
            else:
                # Unfreezing period: the triangle restarts every epoch.
                step = step % self.steps_per_epoch
                total_steps = self.steps_per_epoch

        cut = math.floor(self.cut_frac * total_steps)
        if step < cut:
            progress = step / cut
        else:
            progress = 1.0 - (step - cut) / (total_steps - cut)

        return (1.0 + progress * (self.ratio - 1.0)) / self.ratio

    def _lm_layer_multiplier(self, layer_index):
        """Discriminative fine-tuning multiplier for param group `layer_index`."""
        if layer_index >= self.lm_pretrained_layers:
            # Non-pretrained groups keep their learning rate.
            return 1.0

        factor = 1.0 * self.lm_lr_multiplier
        if self.lm_use_per_layer_lr:
            # Halve the rate for each step away from the last pretrained layer.
            factor *= 2 ** (layer_index - self.lm_pretrained_layers)
        return factor

    def _lm_frozen_multiplier(self, layer_index):
        """Return 0.0 for a group that is currently frozen, 1.0 otherwise."""
        if self._lm_frozen(layer_index):
            return 0.0
        return 1.0

    def _lm_frozen(self, layer_index):
        """
        Tell whether param group `layer_index` is still frozen.

        Only pretrained groups can be frozen, and only when gradual unfreezing
        is enabled; groups closer to the end are unfrozen first.
        """
        if not self.lm_gradual_unfreezing or layer_index >= self.lm_pretrained_layers:
            return False

        # Fractional epoch derived from the batch-update counter.
        elapsed_epochs = self.last_epoch / self.steps_per_epoch
        return elapsed_epochs < self.lm_pretrained_layers - layer_index

    def step_batch(self, metrics=None, epoch=None):
        """Advance the schedule by one batch; `metrics` is not used."""
        self.step(epoch)


class StepLR(TorchStepLR, Scheduler):
    """
    Wrapper around `torch.optim.lr_scheduler.StepLR`, stepped once per epoch.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: Period of learning rate decay.
        step_size: int = 30
        #: Multiplicative factor of learning rate decay.
        gamma: float = 0.1

    @classmethod
    def from_config(cls, config: Config, optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(optimizer, step_size=config.step_size, gamma=config.gamma)

    def step_epoch(self, metrics=None, epoch=None):
        """Advance the schedule at the end of an epoch; `metrics` is not used."""
        self.step(epoch)


class ReduceLROnPlateau(TorchReduceLROnPlateau, Scheduler):
    """
    Wrapper around `torch.optim.lr_scheduler.ReduceLROnPlateau`, stepped once
    per epoch with the monitored metric.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: The desirable direction for the monitored quantity. If set to true,
        #: the learning rate will be reduced when the quantity being monitored
        #: stops going down (torch mode "min"); otherwise when it stops going
        #: up (torch mode "max").
        lower_is_better: bool = True
        #: Factor by which the learning rate will be reduced. new_lr = lr * factor
        factor: float = 0.1
        #: Number of epochs with no improvement after which the learning rate
        #: will be reduced.
        patience: int = 5
        #: Lower bound on the learning rate of all param groups.
        min_lr: float = 0
        #: Threshold for measuring the new optimum, to only focus on
        #: significant changes.
        threshold: float = 0.0001
        #: Selects the torch threshold mode: "abs" when true, "rel" when false.
        #: In rel mode, dynamic_threshold = best * ( 1 + threshold ) in max mode
        #: or best * ( 1 - threshold ) in min mode.
        #: In abs mode, dynamic_threshold = best + threshold in max mode or
        #: best - threshold in min mode.
        threshold_is_absolute: bool = True
        #: Number of epochs to wait before resuming normal operation after
        #: lr has been reduced.
        cooldown: int = 0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Translate the boolean config switches into torch's string modes."""
        if config.lower_is_better:
            mode = "min"
        else:
            mode = "max"

        if config.threshold_is_absolute:
            threshold_mode = "abs"
        else:
            threshold_mode = "rel"

        return cls(
            optimizer,
            mode=mode,
            factor=config.factor,
            patience=config.patience,
            min_lr=config.min_lr,
            threshold=config.threshold,
            threshold_mode=threshold_mode,
            cooldown=config.cooldown,
        )

    def step_epoch(self, metrics, epoch):
        """Feed the monitored `metrics` value to the scheduler after an epoch."""
        self.step(metrics, epoch)


class CosineAnnealingLR(TorchCosineAnnealingLR, BatchScheduler):
    """
    Wrapper around `torch.optim.lr_scheduler.CosineAnnealingLR`, stepped once
    per batch.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: Maximum number of iterations.
        t_max: int = 1000
        #: Minimum learning rate.
        eta_min: float = 0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(optimizer, T_max=config.t_max, eta_min=config.eta_min)

    def step_batch(self, metrics=None, epoch=None):
        """Advance the schedule after a batch; `metrics` is not used."""
        self.step(epoch)


class CyclicLR(TorchCyclicLR, BatchScheduler):
    """
    Wrapper around `torch.optim.lr_scheduler.CyclicLR`, stepped once per batch.
    See the original documentation for more details
    """

    class Config(Scheduler.Config):
        # Each field is handed unchanged to the torch CyclicLR constructor
        # argument of the same name (see from_config).
        base_lr: float = 0.001
        max_lr: float = 0.002
        step_size_up: int = 2000
        step_size_down: Optional[int] = None
        mode: str = "triangular"
        gamma: float = 1.0
        scale_mode: str = "cycle"
        cycle_momentum: bool = True
        base_momentum: float = 0.8
        max_momentum: float = 0.9
        last_epoch: int = -1

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        forwarded_fields = (
            "base_lr",
            "max_lr",
            "step_size_up",
            "step_size_down",
            "mode",
            "gamma",
            "scale_mode",
            "cycle_momentum",
            "base_momentum",
            "max_momentum",
            "last_epoch",
        )
        options = {field: getattr(config, field) for field in forwarded_fields}
        return cls(optimizer=optimizer, **options)

    def step_batch(self, metrics=None, epoch=None):
        """Advance the schedule after a batch; `metrics` is not used."""
        self.step(epoch)


class ExponentialLR(TorchExponentialLR, Scheduler):
    """
    Wrapper around `torch.optim.lr_scheduler.ExponentialLR`, stepped once per
    epoch.
    See the original documentation for more details.
    """

    class Config(Scheduler.Config):
        #: Multiplicative factor of learning rate decay.
        gamma: float = 0.1

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(optimizer, gamma=config.gamma)

    def step_epoch(self, metrics=None, epoch=None):
        """Advance the schedule at the end of an epoch; `metrics` is not used."""
        self.step(epoch)


class WarmupScheduler(_LRScheduler, BatchScheduler):
    """
    Scheduler to linearly increase the learning rate from 0 to its final value
    over a number of steps:

        lr = base_lr * current_step / warmup_steps

    After the warm-up phase, the scheduler has the option of decaying the
    learning rate as the inverse square root of the number of training steps
    taken:

        lr = base_lr * sqrt(warmup_steps) / sqrt(current_step)

    Without that option the learning rate stays at base_lr after warm-up.
    """

    class Config(BatchScheduler.Config):
        #: number of training steps over which to increase learning rate
        warmup_steps: int = 10000

        #: whether to perform inverse sqrt decay after the warmup phase
        inverse_sqrt_decay: bool = False

    def __init__(self, optimizer, warmup_steps, inverse_sqrt_decay):
        """
        Store the warm-up settings and initialize the torch scheduler.

        `warmup_steps` must be positive because get_lr() divides by it.
        """
        assert warmup_steps > 0
        self.current_steps = 0
        self.warmup_steps = warmup_steps
        self.inverse_sqrt_decay = inverse_sqrt_decay
        # sqrt(warmup_steps), computed once for the inverse sqrt decay formula.
        self.decay_factor = warmup_steps ** 0.5
        # Every attribute read by get_lr() is assigned before this call.
        super().__init__(optimizer)

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(
            optimizer,
            warmup_steps=config.warmup_steps,
            inverse_sqrt_decay=config.inverse_sqrt_decay,
        )

    def prepare(self, train_iter, total_epochs):
        """Record the run dimensions, then take one step to initialize the lr."""
        super().prepare(train_iter, total_epochs)
        self.step_batch()  # initialize learning rate

    def step_batch(self):
        """Count one more training step and update the learning rates."""
        self.current_steps += 1
        self.step()

    def get_lr(self):
        """Return the learning rate of each param group for the current step."""
        steps_taken = self.current_steps
        if steps_taken < self.warmup_steps:
            # Warm-up phase: linear ramp towards base_lr.
            lr_multiplier = steps_taken / self.warmup_steps
        elif self.inverse_sqrt_decay:
            lr_multiplier = self.decay_factor / (steps_taken ** 0.5)
        else:
            lr_multiplier = 1.0
        return [lr_multiplier * base_lr for base_lr in self.base_lrs]


class PolynomialDecayScheduler(_LRScheduler, BatchScheduler):
    """
    Applies a polynomial decay with lr warmup to the learning rate.

    It is commonly observed that a monotonically decreasing learning rate, whose
    degree of change is carefully chosen, results in a better performing model.

    This scheduler linearly increases the learning rate from 0 to its final
    value at the beginning of training, over `warmup_steps` steps.
    Then it applies a polynomial decay function to an optimizer step, given a
    provided `base_lrs` to reach an `end_learning_rate` after `total_steps`.

    With `warmup_steps == 0` there is no warmup phase and the schedule starts
    directly in the decay phase at `base_lrs`.
    """

    class Config(BatchScheduler.Config):
        #: number of training steps over which to increase learning rate
        warmup_steps: int = 0
        #: number of training steps for learning rate decay
        total_steps: int
        #: end learning rate after `total_steps` of training
        end_learning_rate: float
        #: power used for polynomial decay calculation
        power: float = 1.0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Build the scheduler for `optimizer` from the fields of `config`."""
        return cls(
            optimizer,
            config.warmup_steps,
            config.total_steps,
            config.end_learning_rate,
            config.power,
        )

    def __init__(self, optimizer, warmup_steps, total_steps, end_learning_rate, power):
        """
        Store the schedule parameters and initialize the torch scheduler.

        Requires `total_steps > warmup_steps >= 0`, which also keeps the decay
        denominator `total_steps - warmup_steps` strictly positive.
        """
        assert total_steps > warmup_steps >= 0
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.current_steps = 0
        # Every attribute read by get_lr() is assigned before this call.
        super().__init__(optimizer)

    def prepare(self, train_iter, total_epochs):
        """Record the run dimensions, then take one step to initialize the lr."""
        super().prepare(train_iter, total_epochs)
        self.step_batch()  # initialize learning rate

    def get_lr(self):
        """Return the learning rate of each param group for the current step."""
        # The `warmup_steps > 0` guard matters: with no warmup configured (the
        # Config default) the warmup branch would compute 0 / 0 when
        # current_steps is 0 and raise ZeroDivisionError.
        in_warmup = self.warmup_steps > 0 and self.current_steps <= self.warmup_steps

        if in_warmup:
            # during warmup the learning rate linearly increases until
            # it reaches base_lr.
            warmup_factor = self.current_steps / self.warmup_steps
            lrs = [warmup_factor * base_lr for base_lr in self.base_lrs]
        elif self.current_steps <= self.total_steps:
            # start polynomial weight decay until it reaches end_learning_rate
            decay_factor = (
                1
                - (self.current_steps - self.warmup_steps)
                / (self.total_steps - self.warmup_steps)
            ) ** self.power

            lrs = [
                (base_lr - self.end_learning_rate) * decay_factor
                + self.end_learning_rate
                for base_lr in self.base_lrs
            ]
        else:
            # reach end_learning_rate after total_steps
            lrs = [self.end_learning_rate for _ in self.base_lrs]

        return lrs

    def step_batch(self):
        """Count one more training step and update the learning rates."""
        self.current_steps += 1
        # update optimizer.param_groups's learning rate
        self.step()


class SchedulerWithWarmup(_LRScheduler, BatchScheduler):
    """
    Wraps another scheduler with a warmup phase. After `warmup_steps` defined in
    warmup_scheduler.warmup_steps, the scheduler will switch to use the specified
    scheduler in `scheduler`.

    `warmup_scheduler`: is the configuration for the WarmupScheduler, that warms up
    learning rate over `warmup_steps` linearly.

    `scheduler`: is the main scheduler that will be applied after the warmup phase
    (once `warmup_steps` have passed)
    """

    class Config(BatchScheduler.Config):
        # the definition of the warmup scheduler for the warmup phase
        warmup_scheduler: WarmupScheduler.Config = WarmupScheduler.Config()

        # the definition of the main scheduler to apply once the warmup phase
        # has passed
        scheduler: Union[
            ExponentialLR.Config,
            CosineAnnealingLR.Config,
            ReduceLROnPlateau.Config,
            LmFineTuning.Config,
            CyclicLR.Config,
        ]

    def __init__(self, optimizer, warmup_scheduler, scheduler, switch_steps):
        """
        Hold both schedulers and the number of warmup steps before switching.

        The parent initializer is not invoked here; the two wrapped schedulers
        were each constructed on `optimizer` by the caller.
        """
        self.optimizer = optimizer
        self.warmup_scheduler = warmup_scheduler
        self.scheduler = scheduler
        self.switch_steps = switch_steps
        # Number of batch steps delegated to the warmup scheduler so far.
        self.curr_steps = 0

    @classmethod
    def from_config(cls, config: Config, optimizer: Optimizer):
        """Create both wrapped schedulers on `optimizer` and combine them."""
        warmup = create_scheduler(config.warmup_scheduler, optimizer)
        main = create_scheduler(config.scheduler, optimizer)
        switch_steps = config.warmup_scheduler.warmup_steps
        return cls(optimizer, warmup, main, switch_steps)

    def _in_warmup(self):
        """Tell whether steps are still delegated to the warmup scheduler."""
        return self.curr_steps < self.switch_steps

    def prepare(self, train_iter, total_epochs):
        """Prepare this scheduler, then the warmup and the main scheduler."""
        super().prepare(train_iter, total_epochs)
        self.warmup_scheduler.prepare(train_iter, total_epochs)
        self.scheduler.prepare(train_iter, total_epochs)

    def step_batch(self):
        """Delegate the batch step to whichever scheduler is currently active."""
        if self._in_warmup():
            self.curr_steps += 1
            return self.warmup_scheduler.step_batch()
        return self.scheduler.step_batch()

    def step_epoch(self, metrics, epoch):
        """Delegate the epoch step to whichever scheduler is currently active."""
        if self._in_warmup():
            return self.warmup_scheduler.step_epoch(metrics=metrics, epoch=epoch)
        # The main scheduler is always called with epoch=None.
        return self.scheduler.step_epoch(metrics=metrics, epoch=None)

    def get_lr(self):
        """Return the learning rates reported by the currently active scheduler."""
        if self._in_warmup():
            return self.warmup_scheduler.get_lr()
        return self.scheduler.get_lr()