Source code for pytext.data.bert_tensorizer

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import Any, Dict, List, Optional, Tuple

import torch
from fairseq.data.dictionary import Dictionary
from fairseq.data.legacy.masked_lm_dictionary import BertDictionary
from pytext import resources
from pytext.common.constants import Token
from pytext.config.component import ComponentType, create_component
from pytext.data.tensorizers import Tensorizer, TensorizerScriptImpl
from pytext.data.tokenizers import Tokenizer, WordPieceTokenizer
from pytext.data.utils import BOS, EOS, MASK, PAD, UNK, Vocabulary
from pytext.torchscript.tensorizer.tensorizer import VocabLookup
from pytext.torchscript.utils import ScriptBatchInput, pad_2d, pad_2d_mask
from pytext.torchscript.vocab import ScriptVocabulary
from pytext.utils.file_io import PathManager
from pytext.utils.lazy import lazy_property


def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Build a PyText vocabulary for models pre-trained using Fairseq modules.
    The dictionary class can be any Fairseq Dictionary class and is used to
    load the vocab file.
    """
    dictionary = dictionary_class.load(vocab_file)
    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    if min_count > 0 or max_vocab > 0:
        dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
    if tokens_to_add:
        for token in tokens_to_add:
            dictionary.add_symbol(token)
    return Vocabulary(
        dictionary.symbols, dictionary.count, replacements=special_token_replacements
    )
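
# Illustrative sketch (not part of the library source): how build_fairseq_vocab
# might be called for a fairseq-format dictionary. The file path and the
# special-token mapping below are assumptions for demonstration only.
#
#   vocab = build_fairseq_vocab(
#       vocab_file="/path/to/dict.txt",
#       special_token_replacements={
#           "<pad>": PAD,
#           "<s>": BOS,
#           "</s>": EOS,
#           "<unk>": UNK,
#           "<mask>": MASK,
#       },
#   )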


class BERTTensorizerBaseScriptImpl(TensorizerScriptImpl):
    def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
        super().__init__()
        self.tokenizer = tokenizer
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(),
        )
        self.vocab_lookup = VocabLookup(self.vocab)
        self.max_seq_len = max_seq_len

    def _lookup_tokens(
        self, tokens: List[Tuple[str, int, int]], max_seq_len: Optional[int] = None
    ) -> Tuple[List[int], List[int], List[int]]:
        """
        This function knows how to call lookup_tokens with the correct settings
        for this model. The default behavior is to wrap the numberized text with
        distinct BOS and EOS tokens. The resulting vector would look something
        like this: [BOS, token1_id, ..., tokenN_id, EOS]

        The function also takes an optional max_seq_len parameter which is used
        to customize truncation in case we have multiple text fields. By default
        self.max_seq_len is used. It's up to the numberize function of the class
        to decide how to use this parameter. For example:

        - In the case of sentence pair classification, we might want both
          pieces of text to have the same length, which is half of the
          max_seq_len supported by the model.
        - In the case of QA, we might want to truncate the context by a
          seq_len which is longer than what we use for the question.

        Args:
            tokens: a list of tokens that represent a sentence, each token
                represented by token string, start and end indices.

        Returns:
            token_ids: List[int], a list of token ids representing the sentence.
            start_indices: List[int], the start index of each token in the sentence.
            end_indices: List[int], the end index of each token in the sentence.
        """
        if max_seq_len is None:
            max_seq_len = self.max_seq_len
        return self.vocab_lookup(
            tokens,
            bos_idx=self.vocab.bos_idx,
            eos_idx=self.vocab.eos_idx,
            use_eos_token_for_bos=False,
            max_seq_len=max_seq_len,
        )

    def _wrap_numberized_tokens(
        self, numberized_tokens: List[int], idx: int
    ) -> List[int]:
        """
        If a class has a non-standard way of generating the final numberized
        text (eg: BERT) then a class-specific version of the
        _wrap_numberized_tokens function should be implemented. This allows us
        to share the numberize function across classes without having to
        copy-paste code. The default implementation doesn't do anything.
        """
        return numberized_tokens

    def numberize(
        self, per_sentence_tokens: List[List[Tuple[str, int, int]]]
    ) -> Tuple[List[int], List[int], int, List[int]]:
        """
        This function contains logic for converting tokens into ids based on
        the specified vocab. It also outputs, for each instance, the vectors
        needed to run the actual model.

        Args:
            per_sentence_tokens: list of tokens per sentence level in one row,
                each token represented by token string, start and end indices.

        Returns:
            tokens: List[int], the token ids of all sentences, concatenated.
            segment_labels: List[int], denotes which sentence each token belongs to.
            seq_len: int, total number of tokens.
            positions: List[int], token positions.
        """
        tokens: List[int] = []
        segment_labels: List[int] = []
        seq_len: int = 0
        positions: List[int] = []

        for idx, single_sentence_tokens in enumerate(per_sentence_tokens):
            lookup_ids: List[int] = self._lookup_tokens(single_sentence_tokens)[0]
            lookup_ids = self._wrap_numberized_tokens(lookup_ids, idx)

            tokens.extend(lookup_ids)
            segment_labels.extend([idx] * len(lookup_ids))

        seq_len = len(tokens)
        positions = [i for i in range(seq_len)]

        return tokens, segment_labels, seq_len, positions
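
    # Illustrative sketch (not part of the original source): for a hypothetical
    # two-sentence input, numberize() concatenates the per-sentence ids and
    # labels each position with its sentence index, e.g.
    #
    #   tokens         = [BOS, w1, w2, EOS, BOS, w3, EOS]  (base impl: BOS/EOS per sentence)
    #   segment_labels = [0,   0,  0,  0,   1,   1,  1]
    #   seq_len        = 7
    #   positions      = [0, 1, 2, 3, 4, 5, 6]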

    def tensorize(
        self,
        tokens_2d: List[List[int]],
        segment_labels_2d: List[List[int]],
        seq_lens_1d: List[int],
        positions_2d: List[List[int]],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Convert instance level vectors into batch level tensors.
        """
        tokens, pad_mask = pad_2d_mask(
            tokens_2d,
            pad_value=self.vocab.pad_idx,
            seq_padding_control=self.seq_padding_control,
            max_seq_pad_len=self.max_seq_len,
            batch_padding_control=self.batch_padding_control,
        )
        segment_labels = torch.tensor(
            pad_2d(
                segment_labels_2d,
                seq_lens=seq_lens_1d,
                pad_idx=0,
                max_len=self.max_seq_len,
            ),
            dtype=torch.long,
        )
        positions = torch.tensor(
            pad_2d(
                positions_2d, seq_lens=seq_lens_1d, pad_idx=0, max_len=self.max_seq_len
            ),
            dtype=torch.long,
        )
        if self.device == "":
            return tokens, pad_mask, segment_labels, positions
        else:
            return (
                tokens.to(self.device),
                pad_mask.to(self.device),
                segment_labels.to(self.device),
                positions.to(self.device),
            )
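
    # Illustrative note (assumption, not from the source): for a batch of B
    # rows padded to length T, tensorize() returns four batch-first tensors of
    # shape (B, T): token ids, a padding mask, segment labels, and positions.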

    def tokenize(
        self,
        row_text: Optional[List[str]],
        row_pre_tokenized: Optional[List[List[str]]],
    ) -> List[List[Tuple[str, int, int]]]:
        """
        This function converts raw inputs into tokens, where each token is
        represented by the token string and its start and end indices in the
        raw input. There are two possible inputs to this function, depending
        on whether the tokenizer is implemented in TorchScript or not.

        Case 1: the tokenizer has a full TorchScript implementation; the input
        will be a list of sentences (in most cases a single sentence or a pair).

        Case 2: the tokenizer has a partial or no TorchScript implementation
        (in most cases the tokenizer will be hosted in Yoda); the input will be
        a list of pre-processed tokens.

        Returns:
            per_sentence_tokens: tokens per sentence level, each token is
                represented by token(str), start and end indices.
        """
        per_sentence_tokens: List[List[Tuple[str, int, int]]] = []

        if row_text is not None:
            for text in row_text:
                per_sentence_tokens.append(self.tokenizer.tokenize(text))
        elif row_pre_tokenized is not None:
            for sentence_pre_tokenized in row_pre_tokenized:
                sentence_tokens: List[Tuple[str, int, int]] = []
                for token in sentence_pre_tokenized:
                    sentence_tokens.extend(self.tokenizer.tokenize(token))
                per_sentence_tokens.append(sentence_tokens)

        return per_sentence_tokens

    def forward(
        self, inputs: ScriptBatchInput
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Wire up the tokenize(), numberize() and tensorize() functions for data
        processing.

        When exporting to TorchScript, the wrapper module should choose to use
        texts or pre_tokenized tokens based on the TorchScript tokenizer
        implementation (e.g. whether an external tokenizer such as Yoda is
        used or not).
        """
        tokens_2d: List[List[int]] = []
        segment_labels_2d: List[List[int]] = []
        seq_lens_1d: List[int] = []
        positions_2d: List[List[int]] = []

        for idx in range(self.batch_size(inputs)):
            tokens: List[List[Tuple[str, int, int]]] = self.tokenize(
                self.get_texts_by_index(inputs.texts, idx),
                self.get_tokens_by_index(inputs.tokens, idx),
            )

            numberized: Tuple[List[int], List[int], int, List[int]] = self.numberize(
                tokens
            )
            tokens_2d.append(numberized[0])
            segment_labels_2d.append(numberized[1])
            seq_lens_1d.append(numberized[2])
            positions_2d.append(numberized[3])

        return self.tensorize(tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d)

    def torchscriptify(self):
        # The tokenizer is only used at inference time, so we defer scripting
        # it until the end of training.
        if not isinstance(self.tokenizer, torch.jit.ScriptModule):
            self.tokenizer = self.tokenizer.torchscriptify()
        return super().torchscriptify()
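
# Illustrative sketch (assumptions marked; not part of the source): the script
# impl is typically driven through forward(), which chains the three steps
# above. With a concrete subclass such as BERTTensorizerScriptImpl (defined
# below), an already-constructed `tokenizer` and `vocab`, and made-up inputs:
#
#   impl = BERTTensorizerScriptImpl(tokenizer=tokenizer, vocab=vocab, max_seq_len=256)
#   per_sentence_tokens = impl.tokenize(["hello world", "how are you"], None)
#   tokens, segments, seq_len, positions = impl.numberize(per_sentence_tokens)
#   batch_tensors = impl.tensorize([tokens], [segments], [seq_len], [positions])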


class BERTTensorizerBase(Tensorizer):
    """
    Base Tensorizer class for all BERT style models including XLM,
    RoBERTa and XLM-R.
    """

    __EXPANSIBLE__ = True

    class Config(Tensorizer.Config):
        # BERT style models support multiple text inputs
        columns: List[str] = ["text"]
        tokenizer: Tokenizer.Config = Tokenizer.Config()
        # base token-level tokenizer for sequence labeling tasks
        base_tokenizer: Optional[Tokenizer.Config] = None
        vocab_file: str = ""
        max_seq_len: int = 256

    def __init__(
        self,
        columns: List[str] = Config.columns,
        vocab: Vocabulary = None,
        tokenizer: Tokenizer = None,
        max_seq_len: int = Config.max_seq_len,
        base_tokenizer: Tokenizer = None,
    ) -> None:
        super().__init__()
        self.columns = columns
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.base_tokenizer = base_tokenizer
        self.max_seq_len = max_seq_len
        # Needed to ensure that we're not masking special tokens. By default
        # we use the BOS token from the vocab. If a class has different
        # behavior (eg: XLM), it needs to override this.
        self.bos_token = self.vocab.bos_token

    @property
    def column_schema(self):
        return [(column, str) for column in self.columns]

    @lazy_property
    def tensorizer_script_impl(self):
        return self.__TENSORIZER_SCRIPT_IMPL__(
            tokenizer=self.tokenizer, vocab=self.vocab, max_seq_len=self.max_seq_len
        )

    def numberize(self, row: Dict) -> Tuple[Any, ...]:
        """
        This function contains logic for converting tokens into ids based on
        the specified vocab. It also outputs, for each instance, the vectors
        needed to run the actual model.
        """
        per_sentence_tokens = [
            self.tokenizer.tokenize(row[column]) for column in self.columns
        ]
        return self.tensorizer_script_impl.numberize(per_sentence_tokens)

    def tensorize(self, batch) -> Tuple[torch.Tensor, ...]:
        """
        Convert instance level vectors into batch level tensors.
        """
        return self.tensorizer_script_impl.tensorize_wrapper(*zip(*batch))

    def initialize(self, vocab_builder=None, from_scratch=True):
        # vocab for BERT is already set
        return
        # we need yield here to make this function a generator
        yield

    def sort_key(self, row):
        return row[2]
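
# Illustrative sketch (not part of the source): at training time the
# non-TorchScript path feeds rows through numberize() and batches through
# tensorize(). The column name and text below are made up for demonstration.
#
#   row = {"text": "hello world"}
#   numberized = tensorizer.numberize(row)   # (tokens, segment_labels, seq_len, positions)
#   batch_tensors = tensorizer.tensorize([numberized])
#
# sort_key() returns element 2 of the numberized tuple (seq_len), so rows can
# be sorted by sequence length when batching.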


class BERTTensorizerScriptImpl(BERTTensorizerBaseScriptImpl):
    def _lookup_tokens(
        self, tokens: List[Tuple[str, int, int]], max_seq_len: Optional[int] = None
    ) -> Tuple[List[int], List[int], List[int]]:
        if max_seq_len is None:
            max_seq_len = self.max_seq_len
        max_seq_len -= 1  # because _wrap_numberized_tokens adds a token
        return self.vocab_lookup(
            tokens,
            bos_idx=None,
            eos_idx=self.vocab.eos_idx,
            use_eos_token_for_bos=False,
            max_seq_len=max_seq_len,
        )

    def _wrap_numberized_tokens(
        self, numberized_tokens: List[int], idx: int
    ) -> List[int]:
        if idx == 0:
            numberized_tokens = [self.vocab.bos_idx] + numberized_tokens
        return numberized_tokens
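
# Illustrative note (derived from the overrides above): for BERT only the
# first sentence is prefixed with BOS ([CLS]) and every sentence is suffixed
# with EOS ([SEP]), e.g. for a made-up sentence pair:
#
#   [CLS] w1 w2 [SEP] w3 w4 [SEP]
#
# versus the base implementation, which wraps every sentence as
#   [BOS] w1 w2 [EOS] [BOS] w3 w4 [EOS]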


class BERTTensorizer(BERTTensorizerBase):
    """
    Tensorizer for BERT tasks. Works for single sentence, sentence pair,
    triples etc.
    """

    __EXPANSIBLE__ = True
    __TENSORIZER_SCRIPT_IMPL__ = BERTTensorizerScriptImpl

    class Config(BERTTensorizerBase.Config):
        tokenizer: Tokenizer.Config = WordPieceTokenizer.Config()
        vocab_file: str = WordPieceTokenizer.Config().wordpiece_vocab_path

    @classmethod
    def from_config(cls, config: Config, **kwargs):
        """
        from_config parses the config associated with the tensorizer and
        creates both the tokenizer and the Vocabulary object. The extra
        arguments passed as kwargs allow us to reuse this function with a
        variable number of arguments (eg: for classes which derive from
        this class).
        """
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        special_token_replacements = {
            "[UNK]": UNK,
            "[PAD]": PAD,
            "[CLS]": BOS,
            "[MASK]": MASK,
            "[SEP]": EOS,
        }
        if isinstance(tokenizer, WordPieceTokenizer):
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=special_token_replacements,
            )
        else:
            config.vocab_file = (
                resources.roberta.RESOURCE_MAP[config.vocab_file]
                if config.vocab_file in resources.roberta.RESOURCE_MAP
                else config.vocab_file
            )
            with PathManager.open(config.vocab_file) as file_path:
                vocab = build_fairseq_vocab(
                    dictionary_class=BertDictionary,
                    vocab_file=file_path,
                    special_token_replacements=special_token_replacements,
                )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            **kwargs,
        )

    def __init__(
        self,
        columns: List[str] = Config.columns,
        vocab: Vocabulary = None,
        tokenizer: Tokenizer = None,
        max_seq_len: int = Config.max_seq_len,
        **kwargs,
    ) -> None:
        super().__init__(
            columns=columns, vocab=vocab, tokenizer=tokenizer, max_seq_len=max_seq_len
        )
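
# Illustrative usage sketch (not part of the source): constructing the
# tensorizer from its config; the max_seq_len value is an arbitrary example.
#
#   config = BERTTensorizer.Config(columns=["text"], max_seq_len=128)
#   tensorizer = BERTTensorizer.from_config(config)
#
# With the default WordPieceTokenizer config, the vocabulary is built directly
# from the tokenizer's wordpiece vocab; otherwise it is loaded from
# config.vocab_file via build_fairseq_vocab with BertDictionary.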