Source code for pytext.data.token_tensorizer

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import List, Optional, Tuple

import torch
from pytext.config.component import ComponentType, create_component
from pytext.data.tensorizers import TensorizerScriptImpl
from pytext.data.tokenizers import Tokenizer
from pytext.torchscript.tensorizer.tensorizer import VocabLookup
from pytext.torchscript.tokenizer import ScriptDoNothingTokenizer
from pytext.torchscript.utils import ScriptBatchInput, pad_2d
from pytext.torchscript.vocab import ScriptVocabulary
from pytext.utils import cuda
from pytext.utils.file_io import PathManager
from pytext.utils.lazy import lazy_property

from .tensorizers import Tensorizer, VocabConfig, tokenize
from .utils import VocabBuilder, Vocabulary


class TokenTensorizerScriptImpl(TensorizerScriptImpl):
    def __init__(
        self,
        add_bos_token: bool,
        add_eos_token: bool,
        use_eos_token_for_bos: bool,
        max_seq_len: int,
        vocab: Vocabulary,
        tokenizer: Optional[Tokenizer],
    ):
        super().__init__()
        if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
            try:
                self.tokenizer = tokenizer.torchscriptify()
            except NotImplementedError:
                # This is fine as long as the exported tokenizer is only used
                # in pre-tokenized mode
                self.tokenizer = None
        else:
            self.tokenizer = None
        self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index() if add_bos_token else -1,
            eos_idx=vocab.get_eos_index() if add_eos_token else -1,
        )
        self.vocab_lookup_1d = VocabLookup(self.vocab)
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len

    def get_texts_by_index(
        self, texts: Optional[List[List[str]]], index: int
    ) -> Optional[str]:
        if texts is None or len(texts) == 0:
            return None
        # TokenTensorizer only works with a single text per row, stick with that
        return texts[index][0]

    def get_tokens_by_index(
        self, tokens: Optional[List[List[List[str]]]], index: int
    ) -> Optional[List[str]]:
        if tokens is None or len(tokens) == 0:
            return None
        # TokenTensorizer only works with a single text per row, stick with that
        return tokens[index][0]

    def _lookup_tokens_1d(
        self, tokens: List[Tuple[str, int, int]]
    ) -> Tuple[List[int], List[int], List[int]]:
        return self.vocab_lookup_1d(
            tokens,
            bos_idx=self.vocab.bos_idx if self.add_bos_token else None,
            eos_idx=self.vocab.eos_idx if self.add_eos_token else None,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
        )

    def tokenize(
        self, row_text: Optional[str], row_pre_tokenized: Optional[List[str]]
    ) -> List[Tuple[str, int, int]]:
        tokens: List[Tuple[str, int, int]] = []
        if row_text is not None:
            if self.tokenizer is not None:
                tokens = self.tokenizer.tokenize(row_text)
        elif row_pre_tokenized is not None:
            for token in row_pre_tokenized:
                tokens.extend(self.do_nothing_tokenizer.tokenize(token))
        return tokens

    def numberize(
        self, text_tokens: List[Tuple[str, int, int]]
    ) -> Tuple[List[int], int, List[Tuple[int, int]]]:
        token_indices: List[int] = []
        token_starts: List[int] = []
        token_ends: List[int] = []
        token_indices, token_starts, token_ends = self._lookup_tokens_1d(text_tokens)
        token_ranges: List[Tuple[int, int]] = []
        for s, e in zip(token_starts, token_ends):
            token_ranges.append((s, e))
        return token_indices, len(token_indices), token_ranges

    def tensorize(
        self,
        tokens_2d: List[List[int]],
        seq_lens_1d: List[int],
        positions_2d: List[List[Tuple[int, int]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        token_indices_tensor: torch.Tensor = torch.tensor(
            pad_2d(tokens_2d, seq_lens=seq_lens_1d, pad_idx=self.vocab.pad_idx),
            dtype=torch.long,
        )
        token_starts_2d: List[List[int]] = []
        token_ends_2d: List[List[int]] = []
        for position_list in positions_2d:
            token_starts_2d.append([x[0] for x in position_list])
            token_ends_2d.append([x[1] for x in position_list])
        token_positions_tensor = torch.stack(
            [
                torch.tensor(
                    pad_2d(token_starts_2d, seq_lens=seq_lens_1d, pad_idx=-1),
                    dtype=torch.long,
                ),
                torch.tensor(
                    pad_2d(token_ends_2d, seq_lens=seq_lens_1d, pad_idx=-1),
                    dtype=torch.long,
                ),
            ],
            dim=2,
        )
        return (
            token_indices_tensor,
            torch.tensor(seq_lens_1d, dtype=torch.long),
            token_positions_tensor,
        )

    def forward(
        self, inputs: ScriptBatchInput
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        tokens_2d: List[List[int]] = []
        seq_lens_1d: List[int] = []
        positions_2d: List[List[Tuple[int, int]]] = []
        for idx in range(self.batch_size(inputs)):
            tokens: List[Tuple[str, int, int]] = self.tokenize(
                self.get_texts_by_index(inputs.texts, idx),
                self.get_tokens_by_index(inputs.tokens, idx),
            )
            numberized: Tuple[List[int], int, List[Tuple[int, int]]] = self.numberize(
                tokens
            )
            tokens_2d.append(numberized[0])
            seq_lens_1d.append(numberized[1])
            positions_2d.append(numberized[2])
        return self.tensorize(tokens_2d, seq_lens_1d, positions_2d)
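

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): how the scripted
# implementation above is driven end to end. The ScriptBatchInput fields are
# assumed to be (texts, tokens, languages) as defined in
# pytext.torchscript.utils; the vocab and tokenizer arguments are placeholders
# supplied by the caller.
def _example_token_script_impl_usage(vocab: Vocabulary, tokenizer: Tokenizer):
    impl = TokenTensorizerScriptImpl(
        add_bos_token=False,
        add_eos_token=False,
        use_eos_token_for_bos=False,
        max_seq_len=128,
        vocab=vocab,
        tokenizer=tokenizer,
    )
    batch = ScriptBatchInput(
        texts=[["hello world"], ["a longer example sentence"]],
        tokens=None,
        languages=None,
    )
    # forward() tokenizes, numberizes, and pads every row in the batch.
    token_ids, seq_lens, positions = impl.forward(batch)
    # token_ids: (batch_size, max_len) long tensor padded with vocab.pad_idx
    # seq_lens:  (batch_size,) long tensor of unpadded sequence lengths
    # positions: (batch_size, max_len, 2) long tensor of (start, end) offsets
    return token_ids, seq_lens, positions
# ---------------------------------------------------------------------------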


class ScriptBasedTokenTensorizer(Tensorizer):
    """
    An implementation of TokenTensorizer that uses a TorchScript module in the
    background and is hence torchscriptifiable.

    Note that unlike the original TokenTensorizer, this version cannot deal
    with arbitrarily nested lists of tokens.

    A short usage sketch follows the class definition below.
    """

    __TENSORIZER_SCRIPT_IMPL__ = TokenTensorizerScriptImpl

    class Config(Tensorizer.Config):
        #: The name of the text column to parse from the data source.
        column: str = "text"
        #: The tokenizer to use to split input text into tokens.
        tokenizer: Tokenizer.Config = Tokenizer.Config()
        add_bos_token: bool = False
        add_eos_token: bool = False
        use_eos_token_for_bos: bool = False
        max_seq_len: Optional[int] = None
        vocab: VocabConfig = VocabConfig()
        vocab_file_delimiter: str = " "

    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(
            text_column=config.column,
            tokenizer=tokenizer,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            max_seq_len=config.max_seq_len,
            vocab_config=config.vocab,
            vocab_file_delimiter=config.vocab_file_delimiter,
            is_input=config.is_input,
        )

    def __init__(
        self,
        text_column,
        tokenizer=None,
        add_bos_token=Config.add_bos_token,
        add_eos_token=Config.add_eos_token,
        use_eos_token_for_bos=Config.use_eos_token_for_bos,
        max_seq_len=Config.max_seq_len,
        vocab_config=None,
        vocab=None,
        vocab_file_delimiter=" ",
        is_input=Config.is_input,
    ):
        self.text_column = text_column
        self.tokenizer = tokenizer or Tokenizer()
        self.vocab = vocab
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len or 2 ** 30  # large number
        self.vocab_builder = None
        self.vocab_config = vocab_config or VocabConfig()
        self.vocab_file_delimiter = vocab_file_delimiter
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.text_column, str)]

    def _reverse_lookup(self, token_ids):
        return [self.vocab[id] for id in token_ids]

    def initialize(self, vocab_builder=None, from_scratch=True):
        """Build vocabulary based on training corpus."""
        if self.vocab and from_scratch:
            if self.vocab_config.build_from_data or self.vocab_config.vocab_files:
                print(
                    f"`{self.text_column}` column: vocab already provided, skipping "
                    f"adding tokens from data and from vocab files."
                )
            return
        if not self.vocab_config.build_from_data and not self.vocab_config.vocab_files:
            raise ValueError(
                f"To create token tensorizer for '{self.text_column}', either "
                f"`build_from_data` or `vocab_files` must be set."
            )
        if not self.vocab_builder:
            # else means not initialize from scratch, self.vocab_builder
            # would be set already
            self.vocab_builder = vocab_builder or VocabBuilder(
                delimiter=self.vocab_file_delimiter
            )
            self.vocab_builder.use_bos = self.add_bos_token
            self.vocab_builder.use_eos = self.add_eos_token
        if not self.vocab_config.build_from_data:
            self._add_vocab_from_files()
            self.vocab = self.vocab_builder.make_vocab()
            return
        try:
            while True:
                row = yield
                raw_text = row[self.text_column]
                tokenized = self.tokenizer.tokenize(raw_text)
                self.vocab_builder.add_all([t.value for t in tokenized])
        except GeneratorExit:
            self.vocab_builder.truncate_to_vocab_size(
                self.vocab_config.size_from_data, self.vocab_config.min_counts
            )
            self._add_vocab_from_files()
            self.vocab = self.vocab_builder.make_vocab()

    def _add_vocab_from_files(self):
        for vocab_file in self.vocab_config.vocab_files:
            with PathManager.open(vocab_file.filepath) as f:
                self.vocab_builder.add_from_file(
                    f,
                    vocab_file.skip_header_line,
                    vocab_file.lowercase_tokens,
                    vocab_file.size_limit,
                )

    def _tokenize(self, text=None, pre_tokenized=None, add_eos_bos=True):
        add_bos = self.add_bos_token and add_eos_bos
        add_eos = self.add_eos_token and add_eos_bos
        return tokenize(
            text=text,
            pre_tokenized=pre_tokenized,
            tokenizer=self.tokenizer,
            bos_token=self.vocab.bos_token if add_bos else None,
            eos_token=self.vocab.eos_token if add_eos else None,
            pad_token=self.vocab.pad_token,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
        )

    @lazy_property
    def tensorizer_script_impl(self):
        return self.__TENSORIZER_SCRIPT_IMPL__(
            add_bos_token=self.add_bos_token,
            add_eos_token=self.add_eos_token,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
            vocab=self.vocab,
            tokenizer=self.tokenizer,
        )

    def numberize(self, row):
        """
        Tokenize and look up in vocabulary.

        A few notable things:

        1) We're using the non-torchscriptified tokenizer here. This allows us
           to use non-torchscriptifiable tokenizers if we don't intend to
           torchscriptify this module.
        2) When using the ScriptImpl to do the lookup, BOS / EOS tokens are
           handled there, so we don't need to add them with the tokenizer.
        3) The tokenize function from tensorizers.py returns a tuple of
           (tokens, start_indices, end_indices) lists, while the ScriptImpl
           expects a list of (token, start_idx, end_idx) tuples, so we
           transpose them here with ``zip(*...)``.
        """
        return self.tensorizer_script_impl.numberize(
            list(zip(*self._tokenize(text=row[self.text_column], add_eos_bos=False)))
        )

    def prepare_input(self, row):
        """
        Tokenize, look up in the vocabulary, and return the tokenized texts as
        raw text.

        As in the function above, tokenization is done with the original and
        not the torchscriptified tokenizer.
        """
        tokenized_texts, start_idx, end_idx = self._tokenize(row[self.text_column])
        token_ranges = list(zip(start_idx, end_idx))
        return list(tokenized_texts), len(tokenized_texts), token_ranges

    def tensorize(self, batch):
        (
            token_indices_tensor,
            seq_lens_1d,
            token_positions_tensor,
        ) = self.tensorizer_script_impl.tensorize_wrapper(*zip(*batch))
        # Need to map them to cuda tensors so that we can run this on GPU
        return (
            cuda.tensor(token_indices_tensor, dtype=torch.long),
            cuda.tensor(seq_lens_1d, dtype=torch.long),
            cuda.tensor(token_positions_tensor, dtype=torch.long),
        )

    def sort_key(self, row):
        # use seq_len as sort key
        return row[1]
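

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): a minimal
# offline walk-through of ScriptBasedTokenTensorizer, assuming the default
# config (vocab built from data, default whitespace Tokenizer). The row dicts
# and the helper name are made up for illustration.
def _example_script_based_token_tensorizer_usage():
    tensorizer = ScriptBasedTokenTensorizer(text_column="text")

    rows = [{"text": "hello world"}, {"text": "hello pytext"}]

    # `initialize()` is a generator: prime it, feed it training rows, then
    # close it so the vocabulary gets finalized in the GeneratorExit handler.
    init = tensorizer.initialize()
    try:
        init.send(None)
        for row in rows:
            init.send(row)
    except StopIteration:
        # Raised if a vocab was already provided and the generator returned early.
        pass
    finally:
        init.close()

    # Per-row numberization, then batching / padding into tensors.
    numberized = [tensorizer.numberize(row) for row in rows]
    tokens, seq_lens, token_ranges = tensorizer.tensorize(numberized)
    return tokens, seq_lens, token_ranges
# ---------------------------------------------------------------------------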