Source code for pytext.models.roberta

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import Dict, List, Tuple

import torch
from pytext.common.constants import Stage
from pytext.config import ConfigBase
from pytext.data.roberta_tensorizer import (
    RoBERTaTensorizer,
    RoBERTaTokenLevelTensorizer,
)
from pytext.data.tensorizers import LabelTensorizer, Tensorizer
from pytext.models.bert_classification_models import NewBertModel
from pytext.models.decoders.mlp_decoder import MLPDecoder
from pytext.models.model import BaseModel
from pytext.models.module import Module, create_module
from pytext.models.output_layers import WordTaggingOutputLayer
from pytext.models.representations.transformer import (
    MultiheadSelfAttention,
    SentenceEncoder,
    Transformer,
    TransformerLayer,
)
from pytext.models.representations.transformer_sentence_encoder_base import (
    TransformerSentenceEncoderBase,
)
from pytext.torchscript.module import get_script_module_cls
from pytext.utils.file_io import PathManager
from torch.serialization import default_restore_location


def init_params(module):
    """Initialize the RoBERTa weights for pre-training from scratch."""
    if isinstance(module, torch.nn.Linear):
        module.weight.data.normal_(mean=0.0, std=0.02)
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, torch.nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=0.02)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
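

# Illustrative sketch (not part of the original module): init_params is intended
# to be used with torch.nn.Module.apply, which visits every submodule recursively
# (as the RoBERTaEncoder constructor below does via self.apply(init_params)).
# The layer sizes here are arbitrary placeholder values.
def _example_init_params():
    layer = torch.nn.Linear(768, 768)
    embedding = torch.nn.Embedding(50265, 768, padding_idx=1)
    # apply() calls init_params on each submodule (here, just the module itself)
    layer.apply(init_params)
    embedding.apply(init_params)
    return layer, embedding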


class RoBERTaEncoderBase(TransformerSentenceEncoderBase):
    __EXPANSIBLE__ = True

    class Config(TransformerSentenceEncoderBase.Config):
        pass

    def _encoder(self, inputs):
        # NewBertModel expects the output as a tuple and grabs the first element
        tokens, _, _, _ = inputs
        full_representation = self.encoder(tokens)
        sentence_rep = full_representation[:, 0, :]
        return [full_representation], sentence_rep


class RoBERTaEncoderJit(RoBERTaEncoderBase):
    """A TorchScript RoBERTa implementation"""

    class Config(RoBERTaEncoderBase.Config):
        pretrained_encoder: Module.Config = Module.Config(
            load_path=(
                "manifold://pytext_training/tree/static/models/roberta_public.pt1"
            )
        )

    def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        self.encoder = create_module(config.pretrained_encoder)
        self.representation_dim = self.encoder.encoder.token_embedding.weight.size(-1)

    def _embedding(self):
        # used to tie weights in MaskedLM model
        return self.encoder.encoder.token_embedding


class RoBERTaEncoder(RoBERTaEncoderBase):
    """A PyTorch RoBERTa implementation"""

    class Config(RoBERTaEncoderBase.Config):
        embedding_dim: int = 768
        vocab_size: int = 50265
        num_encoder_layers: int = 12
        num_attention_heads: int = 12
        model_path: str = (
            "manifold://pytext_training/tree/static/models/roberta_base_torch.pt"
        )
        # Loading the state dict of the model depends on whether the model was
        # previously finetuned in PyText or not. If it was finetuned, then we
        # don't need to translate the state dict and can just load it directly.
        is_finetuned: bool = False

    def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        self.encoder = SentenceEncoder(
            transformer=Transformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=[
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=MultiheadSelfAttention(
                            config.embedding_dim, config.num_attention_heads
                        ),
                    )
                    for _ in range(config.num_encoder_layers)
                ],
            )
        )
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f, map_location=lambda s, l: default_restore_location(s, "cpu")
                )
            # If the model has previously been loaded in PyText and finetuned,
            # we don't need to do the special state dict translation and can
            # load it directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)
        self.representation_dim = self._embedding().weight.size(-1)

    def _embedding(self):
        # used to tie weights in MaskedLM model
        return self.encoder.transformer.token_embedding
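

# Illustrative sketch (not part of the original module): building a small
# RoBERTaEncoder from its Config via create_module, mirroring how the model
# classes below construct their encoders. model_path is left empty so that no
# pretrained checkpoint is fetched; all sizes are reduced placeholder values.
def _example_build_roberta_encoder():
    config = RoBERTaEncoder.Config(
        embedding_dim=64,
        vocab_size=1000,
        num_encoder_layers=2,
        num_attention_heads=2,
        model_path="",  # skip checkpoint loading; weights come from init_params
    )
    encoder = create_module(config, output_encoded_layers=True)
    return encoder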


class RoBERTa(NewBertModel):
    class Config(NewBertModel.Config):
        class InputConfig(ConfigBase):
            tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            labels: LabelTensorizer.Config = LabelTensorizer.Config()

        inputs: InputConfig = InputConfig()
        encoder: RoBERTaEncoderBase.Config = RoBERTaEncoderJit.Config()

    def torchscriptify(self, tensorizers, traced_model):
        """Using the traced model, create a ScriptModule which has a nicer API
        that includes generating tensors from simple data types and returns
        classified values according to the output layer (e.g. as a dict mapping
        class name to score).
        """
        script_tensorizer = tensorizers["tokens"].torchscriptify()
        script_module_cls = get_script_module_cls(
            script_tensorizer.tokenizer.input_type()
        )
        return script_module_cls(
            model=traced_model,
            output_layer=self.output_layer.torchscript_predictions(),
            tensorizer=script_tensorizer,
        )
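

# Illustrative sketch (not part of the original module): a typical export flow,
# assuming `model` is a trained RoBERTa instance, `tensorizers` is the tensorizer
# dict used during training, and `traced_model` was produced beforehand with
# torch.jit.trace. The output path is a placeholder.
def _example_export_roberta(model: RoBERTa, tensorizers, traced_model):
    script_module = model.torchscriptify(tensorizers, traced_model)
    torch.jit.save(script_module, "/tmp/roberta_scripted.pt")
    return script_module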


class RoBERTaWordTaggingModel(BaseModel):
    """
    Single Sentence Token-level Classification Model using RoBERTa.
    """

    class Config(BaseModel.Config):
        class WordTaggingInputConfig(ConfigBase):
            tokens: RoBERTaTokenLevelTensorizer.Config = (
                RoBERTaTokenLevelTensorizer.Config()
            )

        inputs: WordTaggingInputConfig = WordTaggingInputConfig()
        encoder: RoBERTaEncoderBase.Config = RoBERTaEncoderJit.Config()
        decoder: MLPDecoder.Config = MLPDecoder.Config()
        output_layer: WordTaggingOutputLayer.Config = WordTaggingOutputLayer.Config()

    @classmethod
    def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]):
        label_vocab = tensorizers["tokens"].labels_vocab
        vocab = tensorizers["tokens"].vocab

        encoder = create_module(
            config.encoder,
            output_encoded_layers=True,
            padding_idx=vocab.get_pad_index(),
            vocab_size=vocab.__len__(),
        )
        decoder = create_module(
            config.decoder, in_dim=encoder.representation_dim, out_dim=len(label_vocab)
        )
        output_layer = create_module(config.output_layer, labels=label_vocab)
        return cls(encoder, decoder, output_layer)

    def __init__(self, encoder, decoder, output_layer, stage=Stage.TRAIN) -> None:
        super().__init__(stage=stage)
        self.encoder = encoder
        self.decoder = decoder
        self.module_list = [encoder, decoder]
        self.output_layer = output_layer
        self.stage = stage

    def arrange_model_inputs(self, tensor_dict):
        tokens, pad_mask, segment_labels, positions, _ = tensor_dict["tokens"]
        model_inputs = (tokens, pad_mask, segment_labels, positions)
        return (model_inputs,)

    def arrange_targets(self, tensor_dict):
        _, _, _, _, labels = tensor_dict["tokens"]
        return labels

    def forward(self, encoder_inputs: Tuple[torch.Tensor, ...], *args) -> torch.Tensor:
        # The encoder outputs a list of representations for each token, where
        # every element of the list corresponds to a layer in the transformer.
        # We extract and pass on the representations associated with the last
        # layer of the transformer.
        representation = self.encoder(encoder_inputs)[0][-1]
        return self.decoder(representation, *args)
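

# Illustrative sketch (not part of the original module): how the tagging model's
# pieces fit together outside the trainer, assuming `tensor_dict` was produced by
# the "tokens" tensorizer configured above. The decoder emits one score vector
# per input token.
def _example_word_tagging_forward(model: RoBERTaWordTaggingModel, tensor_dict):
    model_inputs = model.arrange_model_inputs(tensor_dict)
    logits = model(*model_inputs)  # roughly (batch_size, seq_len, num_labels)
    targets = model.arrange_targets(tensor_dict)
    return logits, targets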