Source code for pytext.data.tensorizers

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import contextlib
import copy
import sys
from typing import List, Optional

import torch
from pytext.common import Padding
from pytext.config.component import Component, ComponentType, create_component
from pytext.data.data_structures.annotation import (
    REDUCE,
    SHIFT,
    Annotation,
    is_intent_nonterminal,
    is_slot_nonterminal,
    is_unsupported,
    is_valid_nonterminal,
)
from pytext.data.sources.data_source import Gazetteer
from pytext.data.tokenizers import Token, Tokenizer
from pytext.torchscript.tensorizer import VectorNormalizer
from pytext.utils import cuda, precision
from pytext.utils.data import Slot
from pytext.utils.file_io import PathManager
from pytext.utils.lazy import lazy_property

from .utils import (
    BOL,
    BOS,
    BYTE_BOS,
    BYTE_EOS,
    EOL,
    EOS,
    PAD,
    SpecialToken,
    VocabBuilder,
    Vocabulary,
    align_target_label,
    pad_and_tensorize,
)


[docs]@contextlib.contextmanager
def to_device(tensorizer_script_impl, device):
    cur_device = tensorizer_script_impl.device
    tensorizer_script_impl.device = device
    yield
    tensorizer_script_impl.device = cur_device


[docs]def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2 ** 30,
):
    tokenized = (
        pre_tokenized
        or tokenizer.tokenize(text)[
            : max_seq_len - (bos_token is not None) - (eos_token is not None)
        ]
    )
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        tokenized = [Token(pad_token, -1, -1)]

    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx


[docs]def lookup_tokens(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    vocab: Vocabulary = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2 ** 30,
):
    tokenized_texts, start_idx, end_idx = tokenize(
        text,
        pre_tokenized,
        tokenizer,
        bos_token,
        eos_token,
        pad_token,
        use_eos_token_for_bos,
        max_seq_len,
    )
    tokens = vocab.lookup_all(tokenized_texts)
    return tokens, start_idx, end_idx


[docs]class TensorizerScriptImpl(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.device: str = ""

[docs]    @torch.jit.export
    def set_device(self, device: str):
        self.device = device

[docs]    def batch_size(
        self, texts: Optional[List[List[str]]], tokens: Optional[List[List[List[str]]]]
    ) -> int:
        if texts is not None:
            return len(texts)
        elif tokens is not None:
            return len(tokens)
        else:
            raise RuntimeError("Empty input for both texts and tokens.")

[docs]    def row_size(
        self, texts: Optional[List[List[str]]], tokens: Optional[List[List[List[str]]]]
    ) -> int:
        if texts is not None:
            return len(texts[0])
        elif tokens is not None:
            return len(tokens[0])
        else:
            raise RuntimeError("Empty input for both texts and tokens.")

[docs]    def get_texts_by_index(
        self, texts: Optional[List[List[str]]], index: int
    ) -> Optional[List[str]]:
        if texts is None:
            return None
        return texts[index]

[docs]    def get_tokens_by_index(
        self, tokens: Optional[List[List[List[str]]]], index: int
    ) -> Optional[List[List[str]]]:
        if tokens is None:
            return None
        return tokens[index]

[docs]    def tokenize(self, *args, **kwargs):
        """
        This functions will receive the inputs from Clients, usually there are
        two possible inputs
        1) a row of texts: List[str]
        2) a row of pre-processed tokens: List[List[str]]

        Override this function to be TorchScriptable, e.g you need to declare
        concrete input arguments with type hints.
        """
        raise NotImplementedError

[docs]    def numberize(self, *args, **kwargs):
        """
        This functions will receive the outputs from function: tokenize() or
        will be called directly from PyTextTensorizer function: numberize().

        Override this function to be TorchScriptable, e.g you need to declare
        concrete input arguments with type hints.
        """
        raise NotImplementedError

[docs]    def tensorize(self, *args, **kwargs):
        """
        This functions will receive a list(e.g a batch) of outputs
        from function numberize(), padding and convert to output tensors.

        Override this function to be TorchScriptable, e.g you need to declare
        concrete input arguments with type hints.
        """
        raise NotImplementedError

[docs]    @torch.jit.ignore
    def tensorize_wrapper(self, *args, **kwargs):
        """
        This functions will receive a list(e.g a batch) of outputs
        from function numberize(), padding and convert to output tensors.

        It will be called in PyText Tensorizer during training time, this
        function is not torchscriptiable because it depends on cuda.device().
        """
        with to_device(self, cuda.device()):
            return self.tensorize(*args, **kwargs)

[docs]    @torch.jit.ignore
    def torchscriptify(self):
        return torch.jit.script(self)


[docs]class Tensorizer(Component):
    """Tensorizers are a component that converts from batches of
    `pytext.data.type.DataType` instances to tensors. These tensors will eventually
    be inputs to the model, but the model is aware of the tensorizers and can arrange
    the tensors they create to conform to its model.

    Tensorizers have an initialize function. This function allows the tensorizer to
    read through the training dataset to build up any data that it needs for
    creating the model. Commonly this is valuable for things like inferring a
    vocabulary from the training set, or learning the entire set of training labels,
    or slot labels, etc.
    """

    __COMPONENT_TYPE__ = ComponentType.TENSORIZER
    __EXPANSIBLE__ = True
    __TENSORIZER_SCRIPT_IMPL__ = None

[docs]    class Config(Component.Config):
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = True

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.is_input)

    def __init__(self, is_input: bool = True):
        self.is_input = is_input

    @property
    def column_schema(self):
        """Generic types don't pickle well pre-3.7, so we don't actually want
        to store the schema as an attribute. We're already storing all of the
        columns anyway, so until there's a better solution, schema is a property."""
        return []

[docs]    def numberize(self, row):
        raise NotImplementedError

[docs]    def prepare_input(self, row):
        """ Return preprocessed input tensors/blob for caffe2 prediction net."""
        return self.numberize(row)

[docs]    def sort_key(self, row):
        raise NotImplementedError

[docs]    def tensorize(self, batch):
        """Tensorizer knows how to pad and tensorize a batch of it's own output."""
        return batch

[docs]    def initialize(self, from_scratch=True):
        """
        The initialize function is carefully designed to allow us to read through the
        training dataset only once, and not store it in memory. As such, it can't itself
        manually iterate over the data source. Instead, the initialize function is a
        coroutine, which is sent row data. This should look roughly like::

            # set up variables here
            ...
            try:
                # start reading through data source
                while True:
                    # row has type Dict[str, types.DataType]
                    row = yield
                    # update any variables, vocabularies, etc.
                    ...
            except GeneratorExit:
                # finalize your initialization, set instance variables, etc.
                ...

        See `WordTokenizer.initialize` for a more concrete example.
        """
        return
        # we need yield here to make this function a generator
        yield

    @lazy_property
    def tensorizer_script_impl(self):
        # Script tensorizer is unpickleable, we use lazy_property for
        # lazy initialization to construct the object during run time.
        raise NotImplementedError

    def __getstate__(self):
        # make a shallow copy of state to avoid side effect on the original object
        state = copy.copy(vars(self))
        state.pop("tensorizer_script_impl", None)
        return state

[docs]    def torchscriptify(self):
        return self.tensorizer_script_impl.torchscriptify()


[docs]class VocabFileConfig(Component.Config):
    #: File containing tokens to add to vocab (first whitespace-separated entry per
    #: line)
    filepath: str = ""
    #: Whether to skip the first line of the file (e.g. if it is a header line)
    skip_header_line: bool = False
    #: Whether to lowercase each of the tokens in the file
    lowercase_tokens: bool = False
    #: The max number of tokens to add to vocab
    size_limit: int = 0


[docs]class VocabConfig(Component.Config):
    #: Whether to add tokens from training data to vocab.
    build_from_data: bool = True
    #: Add `size_from_data` most frequent tokens in training data to vocab (if this
    #: is 0, add all tokens from training data).
    size_from_data: int = 0
    vocab_files: List[VocabFileConfig] = []


[docs]class TokenTensorizer(Tensorizer):
    """Convert text to a list of tokens. Do this based on a tokenizer configuration,
    and build a vocabulary for numberization. Finally, pad the batch to create
    a square tensor of the correct size.
    """

[docs]    class Config(Tensorizer.Config):
        #: The name of the text column to parse from the data source.
        column: str = "text"
        #: The tokenizer to use to split input text into tokens.
        tokenizer: Tokenizer.Config = Tokenizer.Config()
        add_bos_token: bool = False
        add_eos_token: bool = False
        use_eos_token_for_bos: bool = False
        max_seq_len: Optional[int] = None
        vocab: VocabConfig = VocabConfig()
        vocab_file_delimiter: str = " "

[docs]    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(
            text_column=config.column,
            tokenizer=tokenizer,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            max_seq_len=config.max_seq_len,
            vocab_config=config.vocab,
            vocab_file_delimiter=config.vocab_file_delimiter,
            is_input=config.is_input,
        )

    def __init__(
        self,
        text_column,
        tokenizer=None,
        add_bos_token=Config.add_bos_token,
        add_eos_token=Config.add_eos_token,
        use_eos_token_for_bos=Config.use_eos_token_for_bos,
        max_seq_len=Config.max_seq_len,
        vocab_config=None,
        vocab=None,
        vocab_file_delimiter=" ",
        is_input=Config.is_input,
    ):
        self.text_column = text_column
        self.tokenizer = tokenizer or Tokenizer()
        self.vocab = vocab
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len or 2 ** 30  # large number
        self.vocab_builder = None
        self.vocab_config = vocab_config or VocabConfig()
        self.vocab_file_delimiter = vocab_file_delimiter
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.text_column, str)]

    def _tokenize(self, text=None, pre_tokenized=None):
        return tokenize(
            text=text,
            pre_tokenized=pre_tokenized,
            tokenizer=self.tokenizer,
            bos_token=self.vocab.bos_token if self.add_bos_token else None,
            eos_token=self.vocab.eos_token if self.add_eos_token else None,
            pad_token=self.vocab.pad_token,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
        )

    def _lookup_tokens(self, text=None, pre_tokenized=None):
        return lookup_tokens(
            text=text,
            pre_tokenized=pre_tokenized,
            tokenizer=self.tokenizer,
            vocab=self.vocab,
            bos_token=self.vocab.bos_token if self.add_bos_token else None,
            eos_token=self.vocab.eos_token if self.add_eos_token else None,
            pad_token=self.vocab.pad_token,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
        )

    def _reverse_lookup(self, token_ids):
        return [self.vocab[id] for id in token_ids]

[docs]    def initialize(self, vocab_builder=None, from_scratch=True):
        """Build vocabulary based on training corpus."""
        if self.vocab and from_scratch:
            if self.vocab_config.build_from_data or self.vocab_config.vocab_files:
                print(
                    f"`{self.text_column}` column: vocab already provided, skipping "
                    f"adding tokens from data and from vocab files."
                )
            return

        if not self.vocab_config.build_from_data and not self.vocab_config.vocab_files:
            raise ValueError(
                f"To create token tensorizer for '{self.text_column}', either "
                f"`build_from_data` or `vocab_files` must be set."
            )
        if not self.vocab_builder:
            # else means not initialize from scratch, self.vocab_builder
            # would be set already
            self.vocab_builder = vocab_builder or VocabBuilder(
                delimiter=self.vocab_file_delimiter
            )
            self.vocab_builder.use_bos = self.add_bos_token
            self.vocab_builder.use_eos = self.add_eos_token
        if not self.vocab_config.build_from_data:
            self._add_vocab_from_files()
            self.vocab = self.vocab_builder.make_vocab()
            return

        try:
            while True:
                row = yield
                raw_text = row[self.text_column]
                tokenized = self.tokenizer.tokenize(raw_text)
                self.vocab_builder.add_all([t.value for t in tokenized])
        except GeneratorExit:
            self.vocab_builder.truncate_to_vocab_size(self.vocab_config.size_from_data)
            self._add_vocab_from_files()
            self.vocab = self.vocab_builder.make_vocab()

    def _add_vocab_from_files(self):
        for vocab_file in self.vocab_config.vocab_files:
            with PathManager.open(vocab_file.filepath) as f:
                self.vocab_builder.add_from_file(
                    f,
                    vocab_file.skip_header_line,
                    vocab_file.lowercase_tokens,
                    vocab_file.size_limit,
                )

[docs]    def numberize(self, row):
        """Tokenize, look up in vocabulary."""
        tokens, start_idx, end_idx = self._lookup_tokens(row[self.text_column])
        token_ranges = list(zip(start_idx, end_idx))
        return tokens, len(tokens), token_ranges

[docs]    def prepare_input(self, row):
        """Tokenize, look up in vocabulary, return tokenized_texts in raw text"""
        tokenized_texts, start_idx, end_idx = self._tokenize(row[self.text_column])
        token_ranges = list(zip(start_idx, end_idx))
        return list(tokenized_texts), len(tokenized_texts), token_ranges

[docs]    def tensorize(self, batch):
        tokens, seq_lens, token_ranges = zip(*batch)
        return (
            pad_and_tensorize(tokens, self.vocab.get_pad_index()),
            pad_and_tensorize(seq_lens),
            pad_and_tensorize(token_ranges),
        )

[docs]    def sort_key(self, row):
        # use seq_len as sort key
        return row[1]


[docs]class ByteTensorizer(Tensorizer):
    """Turn characters into sequence of int8 bytes. One character will have one
    or more bytes depending on it's encoding
    """

    UNK_BYTE = 0
    PAD_BYTE = 0
    NUM = 256

[docs]    class Config(Tensorizer.Config):
        #: The name of the text column to parse from the data source.
        column: str = "text"
        lower: bool = True
        max_seq_len: Optional[int] = None
        add_bos_token: Optional[bool] = False
        add_eos_token: Optional[bool] = False
        use_eos_token_for_bos: Optional[bool] = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.column,
            config.lower,
            config.max_seq_len,
            config.add_bos_token,
            config.add_eos_token,
            config.use_eos_token_for_bos,
            config.is_input,
        )

    def __init__(
        self,
        text_column,
        lower=True,
        max_seq_len=None,
        add_bos_token=Config.add_bos_token,
        add_eos_token=Config.add_eos_token,
        use_eos_token_for_bos=Config.use_eos_token_for_bos,
        is_input=Config.is_input,
    ):
        self.text_column = text_column
        self.lower = lower
        self.max_seq_len = max_seq_len
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.text_column, str)]

[docs]    def numberize(self, row):
        """Convert text to characters."""
        text = row[self.text_column].strip()

        if self.lower:
            text = text.lower()

        bytes = list(text.encode())

        if self.max_seq_len:
            bytes = bytes[: self.max_seq_len]
        if self.add_bos_token:
            bos = BYTE_EOS if self.use_eos_token_for_bos else BYTE_BOS
            if bos in text:
                print('Special token "{}" exists in text "{}". Exit.'.format(bos, text))
                sys.exit(1)
            bytes = list(bos.encode()) + bytes
        if self.add_eos_token:
            if BYTE_EOS in text:
                print(
                    'Special token "{}" exists in text "{}". Exit.'.format(
                        BYTE_EOS, text
                    )
                )
                sys.exit(1)
            bytes = bytes + list(BYTE_EOS.encode())
        return bytes, len(bytes)

[docs]    def tensorize(self, batch):
        bytes, bytes_len = zip(*batch)
        return pad_and_tensorize(bytes, self.PAD_BYTE), pad_and_tensorize(bytes_len)

[docs]    def sort_key(self, row):
        # use bytes_len as sort key
        return row[1]


[docs]class ByteTokenTensorizer(Tensorizer):
    """Turn words into 2-dimensional tensors of int8 bytes. Words are padded to
    `max_byte_len`. Also computes sequence lengths (1-D tensor) and token lengths
    (2-D tensor). 0 is the pad byte.
    """

    NUM_BYTES = 256

[docs]    class Config(Tensorizer.Config):
        #: The name of the text column to parse from the data source.
        column: str = "text"
        #: The tokenizer to use to split input text into tokens.
        tokenizer: Tokenizer.Config = Tokenizer.Config()
        #: The max token length for input text.
        max_seq_len: Optional[int] = None
        #: The max byte length for a token.
        max_byte_len: int = 15
        #: Offset to add to all non-padding bytes
        offset_for_non_padding: int = 0
        add_bos_token: bool = False
        add_eos_token: bool = False
        use_eos_token_for_bos: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(
            text_column=config.column,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            max_byte_len=config.max_byte_len,
            offset_for_non_padding=config.offset_for_non_padding,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            is_input=config.is_input,
        )

    def __init__(
        self,
        text_column,
        tokenizer=None,
        max_seq_len=Config.max_seq_len,
        max_byte_len=Config.max_byte_len,
        offset_for_non_padding=Config.offset_for_non_padding,
        add_bos_token=Config.add_bos_token,
        add_eos_token=Config.add_eos_token,
        use_eos_token_for_bos=Config.use_eos_token_for_bos,
        is_input=Config.is_input,
    ):
        self.text_column = text_column
        self.tokenizer = tokenizer or Tokenizer()
        self.max_seq_len = max_seq_len or 2 ** 30  # large number
        self.max_byte_len = max_byte_len
        self.offset_for_non_padding = offset_for_non_padding
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.text_column, str)]

[docs]    def numberize(self, row):
        """Convert text to bytes, pad batch."""
        tokens = self.tokenizer.tokenize(row[self.text_column])[
            : (self.max_seq_len - self.add_bos_token - self.add_eos_token)
        ]
        if self.add_bos_token:
            bos = EOS if self.use_eos_token_for_bos else BOS
            tokens = [Token(bos, -1, -1)] + tokens
        if self.add_eos_token:
            tokens.append(Token(EOS, -1, -1))

        if not tokens:
            tokens = [Token(PAD, -1, -1)]
        bytes = [self._numberize_token(token)[: self.max_byte_len] for token in tokens]
        token_lengths = len(tokens)
        byte_lengths = [len(token_bytes) for token_bytes in bytes]
        return bytes, token_lengths, byte_lengths

    def _numberize_token(self, token):
        return [c + self.offset_for_non_padding for c in token.value.encode()]

[docs]    def tensorize(self, batch, pad_token=0):
        bytes, token_lengths, byte_lengths = zip(*batch)
        # Set bytes shape because byte length should always be `max_byte_len` no
        # matter how long the bytes in the batch are.
        pad_shape = (
            len(batch),
            precision.pad_length(max(len(l) for l in byte_lengths)),
            self.max_byte_len,
        )
        return (
            pad_and_tensorize(bytes, pad_shape=pad_shape, pad_token=pad_token),
            pad_and_tensorize(token_lengths),
            pad_and_tensorize(byte_lengths),
        )

[docs]    def sort_key(self, row):
        return len(row[0])


[docs]class CharacterTokenTensorizer(TokenTensorizer):
    """Turn words into 2-dimensional tensors of ints based on their ascii values.
    Words are padded to the maximum word length (also capped at `max_char_length`).
    Sequence lengths are the length of each token, 0 for pad token.
    """

[docs]    class Config(TokenTensorizer.Config):
        #: The max character length for a token.
        max_char_length: int = 20

    def __init__(self, max_char_length: int = Config.max_char_length, **kwargs):
        self.max_char_length = max_char_length
        super().__init__(**kwargs)

    # Don't need to create a vocab
    initialize = Tensorizer.initialize

[docs]    def numberize(self, row):
        """Convert text to characters, pad batch."""
        tokens = self.tokenizer.tokenize(row[self.text_column])[: self.max_seq_len]
        characters = [
            self._numberize_token(token)[: self.max_char_length] for token in tokens
        ]
        token_lengths = len(tokens)
        char_lengths = [len(token_chars) for token_chars in characters]
        return characters, token_lengths, char_lengths

    def _numberize_token(self, token):
        return [ord(c) for c in token.value]

[docs]    def tensorize(self, batch):
        characters, token_lengths, char_lengths = zip(*batch)
        return (
            pad_and_tensorize(characters),
            pad_and_tensorize(token_lengths),
            pad_and_tensorize(char_lengths),
        )

[docs]    def sort_key(self, row):
        return len(row[0])


[docs]class LabelTensorizer(Tensorizer):
    """Numberize labels. Label can be used as either input or target """

    __EXPANSIBLE__ = True

[docs]    class Config(Tensorizer.Config):
        #: The name of the label column to parse from the data source.
        column: str = "label"
        #: Whether to allow for unknown labels at test/prediction time
        allow_unknown: bool = False
        #: if vocab should have pad, usually false when label is used as target
        pad_in_vocab: bool = False
        #: The label values, if known. Will skip initialization step if provided.
        label_vocab: Optional[List[str]] = None
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.column,
            config.allow_unknown,
            config.pad_in_vocab,
            config.label_vocab,
            config.is_input,
        )

    def __init__(
        self,
        label_column: str = "label",
        allow_unknown: bool = False,
        pad_in_vocab: bool = False,
        label_vocab: Optional[List[str]] = None,
        is_input: bool = Config.is_input,
    ):
        self.label_column = label_column
        self.pad_in_vocab = pad_in_vocab
        self.vocab_builder = VocabBuilder()
        self.vocab_builder.use_pad = pad_in_vocab
        self.vocab_builder.use_unk = allow_unknown
        self.vocab = None
        self.pad_idx = -1
        if label_vocab:
            self.vocab_builder.add_all(label_vocab)
            self.vocab, self.pad_idx = self._create_vocab()
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.label_column, str)]

[docs]    def initialize(self, from_scratch=True):
        """
        Look through the dataset for all labels and create a vocab map for them.
        """
        if self.vocab and from_scratch:
            return
        try:
            while True:
                row = yield
                labels = row[self.label_column]
                self.vocab_builder.add_all(labels)
        except GeneratorExit:
            self.vocab, self.pad_idx = self._create_vocab()

    def _create_vocab(self):
        vocab = self.vocab_builder.make_vocab()
        pad_idx = (
            vocab.get_pad_index()
            if self.pad_in_vocab
            else Padding.DEFAULT_LABEL_PAD_IDX
        )
        return vocab, pad_idx

[docs]    def numberize(self, row):
        """Numberize labels."""
        return self.vocab.lookup_all(row[self.label_column])

[docs]    def tensorize(self, batch):
        return pad_and_tensorize(batch, self.pad_idx)


[docs]class LabelListTensorizer(LabelTensorizer):
    """LabelListTensorizer takes a list of labels as input and generate a tuple
    of tensors (label_idx, list_length).
    """

    def __init__(self, label_column: str = "label", *args, **kwargs):
        super().__init__(label_column, *args, **kwargs)

    @property
    def column_schema(self):
        return [(self.label_column, List[str])]

[docs]    def numberize(self, row):
        labels = super().numberize(row)
        return labels, len(labels)

[docs]    def tensorize(self, batch):
        labels, labels_len = zip(*batch)
        return super().tensorize(labels), pad_and_tensorize(labels_len)

[docs]    def sort_key(self, row):
        # use list length as sort key
        return row[1]


[docs]class UidTensorizer(Tensorizer):
    """Numberize user IDs which can be either strings or tensors."""

[docs]    class Config(Tensorizer.Config):
        column: str = "uid"
        # Allow unknown users during prediction.
        allow_unknown: bool = True

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.column, config.allow_unknown, config.is_input)

    def __init__(
        self,
        uid_column: str = "uid",
        allow_unknown: bool = True,
        is_input: bool = Config.is_input,
    ):
        self.uid_column = uid_column
        self.vocab_builder = VocabBuilder()
        # User IDs should have the same lengths so need not to use padding.
        self.vocab_builder.use_pad = False
        self.vocab_builder.use_unk = allow_unknown
        self.vocab = None
        self.pad_idx = -1
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.uid_column, str)]

    def _get_row_value_as_str(self, row) -> str:
        """Handle the case that the row value is not a string."""
        row_value = row[self.uid_column]
        if isinstance(row_value, torch.Tensor):
            assert (
                row_value.dim() == 0 or len(row_value) == 1
            ), "Cannot get the value of multi-dimensional tensors."
            row_value = str(row_value.item())
        return row_value

[docs]    def initialize(self, from_scratch=True):
        """
        Look through the dataset for all uids and create a vocab map for them.
        """
        if self.vocab and from_scratch:
            return
        try:
            while True:
                row = yield
                uids = self._get_row_value_as_str(row)
                self.vocab_builder.add_all(uids)
        except GeneratorExit:
            self.vocab, self.pad_idx = self._create_vocab()

    def _create_vocab(self):
        vocab = self.vocab_builder.make_vocab()
        pad_idx = Padding.DEFAULT_LABEL_PAD_IDX
        return vocab, pad_idx

[docs]    def numberize(self, row):
        """Numberize uids."""
        return self.vocab.lookup_all(self._get_row_value_as_str(row))

[docs]    def tensorize(self, batch):
        return pad_and_tensorize(batch, self.pad_idx)


[docs]class SoftLabelTensorizer(LabelTensorizer):
    """
    Handles numberizing labels for knowledge distillation. This still requires the same
    label column as `LabelTensorizer` for the "true" label, but also processes soft
    "probabilistic" labels generated from a teacher model, via three new columns.
    """

[docs]    class Config(LabelTensorizer.Config):
        probs_column: str = "target_probs"
        logits_column: str = "target_logits"
        labels_column: str = "target_labels"

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.column,
            config.allow_unknown,
            config.pad_in_vocab,
            config.label_vocab,
            config.probs_column,
            config.logits_column,
            config.labels_column,
            config.is_input,
        )

    def __init__(
        self,
        label_column: str = "label",
        allow_unknown: bool = False,
        pad_in_vocab: bool = False,
        label_vocab: Optional[List[str]] = None,
        probs_column: str = "target_probs",
        logits_column: str = "target_logits",
        labels_column: str = "target_labels",
        is_input: bool = Config.is_input,
    ):
        super().__init__(
            label_column, allow_unknown, pad_in_vocab, label_vocab, is_input
        )
        self.probs_column = probs_column
        self.logits_column = logits_column
        self.labels_column = labels_column

    @property
    def column_schema(self):
        return [
            (self.label_column, str),
            (self.probs_column, List[float]),
            (self.logits_column, List[float]),
            (self.labels_column, List[str]),
        ]

[docs]    def numberize(self, row):
        """Numberize hard and soft labels"""
        label = self.vocab.lookup_all(row[self.label_column])
        row_labels = row[self.labels_column]
        probs = align_target_label(row[self.probs_column], row_labels, self.vocab.idx)
        logits = align_target_label(row[self.logits_column], row_labels, self.vocab.idx)
        return label, probs, logits

[docs]    def tensorize(self, batch):
        label, probs, logits = zip(*batch)
        # Set probs and logits shape because they should not change with fp16
        probs_shape = len(probs), len(self.vocab)
        return (
            pad_and_tensorize(label, self.pad_idx),
            pad_and_tensorize(probs, dtype=torch.float, pad_shape=probs_shape),
            pad_and_tensorize(logits, dtype=torch.float, pad_shape=probs_shape),
        )


[docs]class NumericLabelTensorizer(Tensorizer):
    """Numberize numeric labels."""

[docs]    class Config(Tensorizer.Config):
        #: The name of the label column to parse from the data source.
        column: str = "label"
        #: If provided, the range of values the raw label can be. Will rescale the
        #: label values to be within [0, 1].
        rescale_range: Optional[List[float]] = None
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.column, config.rescale_range, config.is_input)

    def __init__(
        self,
        label_column: str = Config.column,
        rescale_range: Optional[List[float]] = Config.rescale_range,
        is_input: bool = Config.is_input,
    ):
        self.label_column = label_column
        if rescale_range is not None:
            assert len(rescale_range) == 2
            assert rescale_range[0] < rescale_range[1]
        self.rescale_range = rescale_range
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.label_column, str)]

[docs]    def numberize(self, row):
        """Numberize labels."""
        label = float(row[self.label_column])
        if self.rescale_range is not None:
            label -= self.rescale_range[0]
            label /= self.rescale_range[1] - self.rescale_range[0]
            assert 0 <= label <= 1
        return label

[docs]    def tensorize(self, batch):
        return pad_and_tensorize(batch, dtype=torch.float)


[docs]class FloatListTensorizer(Tensorizer):
    """Numberize numeric labels."""

[docs]    class Config(Tensorizer.Config):
        #: The name of the label column to parse from the data source.
        column: str
        error_check: bool = False
        dim: Optional[int] = None
        # If you wish to normalize the training data here, you probably also
        # want to normalize the inference data. This is currently supported with
        # TorchScript models (see DocModel). See T48207828 for progress on
        # supporting Caffe2 models.
        normalize: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.column,
            config.error_check,
            config.dim,
            config.normalize,
            config.is_input,
        )

    def __init__(
        self,
        column: str,
        error_check: bool,
        dim: Optional[int],
        normalize: bool,
        is_input: bool = Config.is_input,
    ):
        self.column = column
        self.error_check = error_check
        self.dim = dim
        self.normalizer = VectorNormalizer(dim, normalize)
        assert not self.error_check or self.dim is not None, "Error check requires dim"
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, List[float])]

[docs]    def initialize(self):
        if not self.normalizer.do_normalization:
            self.normalizer.calculate_feature_stats()
            return
        try:
            while True:
                row = yield
                res = row[self.column]
                self.normalizer.update_meta_data(res)
        except GeneratorExit:
            self.normalizer.calculate_feature_stats()

[docs]    def numberize(self, row):
        dense = row[self.column]
        if self.error_check:
            assert (
                len(dense) == self.dim
            ), f"Dense feature didn't match expected dimension {self.dim}: {dense}"
        return self.normalizer.normalize([dense])[0]

[docs]    def tensorize(self, batch):
        return pad_and_tensorize(batch, dtype=torch.float)


NO_LABEL = SpecialToken("NoLabel")


[docs]class SlotLabelTensorizer(Tensorizer):
    """Numberize word/slot labels."""

[docs]    class Config(Tensorizer.Config):
        #: The name of the slot label column to parse from the data source.
        slot_column: str = "slots"
        #: The name of the text column to parse from the data source.
        #: We need this to be able to generate tensors which correspond to input text.
        text_column: str = "text"
        #: The tokenizer to use to split input text into tokens. This should be
        #: configured in a way which yields tokens consistent with the tokens input to
        #: or output by a model, so that the labels generated by this tensorizer
        #: will match the indices of the model's tokens.
        tokenizer: Tokenizer.Config = Tokenizer.Config()
        #: Whether to allow for unknown labels at test/prediction time
        allow_unknown: bool = False
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(
            config.slot_column,
            config.text_column,
            tokenizer,
            config.allow_unknown,
            config.is_input,
        )

    def __init__(
        self,
        slot_column: str = Config.slot_column,
        text_column: str = Config.text_column,
        tokenizer: Tokenizer = None,
        allow_unknown: bool = Config.allow_unknown,
        is_input: bool = Config.is_input,
    ):
        self.slot_column = slot_column
        self.text_column = text_column
        self.allow_unknown = allow_unknown
        self.tokenizer = tokenizer or Tokenizer()
        self.pad_idx = Padding.DEFAULT_LABEL_PAD_IDX
        self.vocab_builder = VocabBuilder()
        self.vocab_builder.add(NO_LABEL)
        self.vocab_builder.use_pad = False
        self.vocab_builder.use_unk = self.allow_unknown
        self.vocab = None
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.text_column, str), (self.slot_column, List[Slot])]

[docs]    def initialize(self, from_scratch=True):
        """Look through the dataset for all labels and create a vocab map for them."""
        if self.vocab and from_scratch:
            return
        try:
            while True:
                row = yield
                slots = row[self.slot_column]
                self.vocab_builder.add_all(s.label for s in slots)
        except GeneratorExit:
            self.vocab = self.vocab_builder.make_vocab()

[docs]    def numberize(self, row):
        """
        Turn slot labels and text into a list of token labels with the same
        length as the number of tokens in the text.
        """
        slots = row[self.slot_column]
        text = row[self.text_column]
        tokens = self.tokenizer.tokenize(text)
        indexed_tokens = tokens
        labels = []
        current_slot = 0
        current_token = 0
        while current_token < len(tokens) and current_slot < len(slots):
            _, start, end = indexed_tokens[current_token]
            slot = slots[current_slot]
            if start > slot.end:
                current_slot += 1
            else:
                current_token += 1
                labels.append(slot.label if end > slot.start else NO_LABEL)
        labels += [NO_LABEL] * (len(tokens) - current_token)
        return self.vocab.lookup_all(labels)

[docs]    def tensorize(self, batch):
        return pad_and_tensorize(batch, dtype=torch.long)


[docs]class SlotLabelTensorizerExpansible(SlotLabelTensorizer):
    """Create a base SlotLabelTensorizer to support selecting different
       types in ModelInput."""

    __EXPANSIBLE__ = True


[docs]class GazetteerTensorizer(Tensorizer):
    """
    Create 3 tensors for dict features.

    - idx: index of feature in token order.
    - weights: weight of feature in token order.
    - lens: number of features per token.

    For each input token, there will be the same number of `idx` and `weights` entries.
    (equal to the max number of features any token has in this row). The values
    in `lens` will tell how many of these features are actually used per token.

    Input format for the dict column is json and should be a list of dictionaries
    containing the "features" and their weight for each relevant "tokenIdx". Example:
    ::

        text: "Order coffee from Starbucks please"
        dict: [
            {"tokenIdx": 1, "features": {"drink/beverage": 0.8, "music/song": 0.2}},
            {"tokenIdx": 3, "features": {"store/coffee_shop": 1.0}}
        ]

    if we assume this vocab
    ::

        vocab = {
            UNK: 0, PAD: 1,
            "drink/beverage": 2, "music/song": 3, "store/coffee_shop": 4
        }

    this example will result in those tensors:
    ::

        idx =     [1,   1,   2,   3,   1,   1,   4,   1,   1,   1]
        weights = [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
        lens =    [1,        2,        1,        1,        1]

    """

[docs]    class Config(Tensorizer.Config):
        text_column: str = "text"
        dict_column: str = "dict"
        #: tokenizer to split text and create dict tensors of the same size.
        tokenizer: Tokenizer.Config = Tokenizer.Config()

[docs]    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(config.text_column, config.dict_column, tokenizer, config.is_input)

    def __init__(
        self,
        text_column: str = Config.text_column,
        dict_column: str = Config.dict_column,
        tokenizer: Tokenizer = None,
        is_input: bool = Config.is_input,
    ):
        self.text_column = text_column
        self.dict_column = dict_column
        self.tokenizer = tokenizer or Tokenizer()
        self.vocab_builder = VocabBuilder()
        self.vocab = None
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.text_column, str), (self.dict_column, Gazetteer)]

[docs]    def initialize(self, from_scratch=True):
        """
        Look through the dataset for all dict features to create vocab.
        """
        if self.vocab and from_scratch:
            return
        try:
            while True:
                row = yield
                for token_dict in row[self.dict_column]:
                    self.vocab_builder.add_all(token_dict["features"])
        except GeneratorExit:
            self.vocab = self.vocab_builder.make_vocab()

[docs]    def numberize(self, row):
        """
        Numberize dict features. Fill in for tokens with no features with
        PAD and weight 0.0. All tokens need to have at least one entry.
        Tokens with more than one feature will have multiple idx and weight
        added in sequence.
        """

        num_tokens = len(self.tokenizer.tokenize(row[self.text_column]))
        num_labels = max(len(t["features"]) for t in row[self.dict_column])
        res_idx = [self.vocab.get_pad_index()] * (num_labels * num_tokens)
        res_weights = [0.0] * (num_labels * num_tokens)
        res_lens = [1] * num_tokens
        for dict_feature in row[self.dict_column]:
            idx = dict_feature["tokenIdx"]
            feats = dict_feature["features"]
            pos = idx * num_labels
            res_lens[idx] = len(feats)
            # write values at the correct pos
            for label, weight in feats.items():
                res_idx[pos] = self.vocab.lookup_all(label)
                res_weights[pos] = weight
                pos += 1

        return res_idx, res_weights, res_lens

[docs]    def tensorize(self, batch):
        # Pad a minibatch of dictionary features to be
        # batch_size * max_number_of_words * max_number_of_features
        # unpack the minibatch
        feats, weights, lengths = zip(*batch)
        lengths_flattened = [l for l_list in lengths for l in l_list]
        seq_lens = [len(l_list) for l_list in lengths]
        max_ex_len = max(seq_lens)
        max_feat_len = max(lengths_flattened)
        all_lengths, all_feats, all_weights = [], [], []
        for i, seq_len in enumerate(seq_lens):
            ex_feats, ex_weights, ex_lengths = [], [], []
            feats_lengths, feats_vals, feats_weights = lengths[i], feats[i], weights[i]
            max_feat_len_example = max(feats_lengths)
            r_offset = 0
            for _ in feats_lengths:
                # The dict feats obtained from the featurizer will have necessary
                # padding at the utterance level. Therefore we move the offset by
                # max feature length in the example.
                ex_feats.extend(feats_vals[r_offset : r_offset + max_feat_len_example])
                ex_feats.extend(
                    [self.vocab.get_pad_index()] * (max_feat_len - max_feat_len_example)
                )
                ex_weights.extend(
                    feats_weights[r_offset : r_offset + max_feat_len_example]
                )
                ex_weights.extend([0.0] * (max_feat_len - max_feat_len_example))
                r_offset += max_feat_len_example
            ex_lengths.extend(feats_lengths)
            # Pad examples
            ex_padding = (max_ex_len - seq_len) * max_feat_len
            ex_feats.extend([self.vocab.get_pad_index()] * ex_padding)
            ex_weights.extend([0.0] * ex_padding)
            ex_lengths.extend([1] * (max_ex_len - seq_len))
            all_feats.append(ex_feats)
            all_weights.append(ex_weights)
            all_lengths.append(ex_lengths)
        return (
            cuda.tensor(all_feats, torch.long),
            cuda.tensor(all_weights, torch.float),
            cuda.tensor(all_lengths, torch.long),
        )


[docs]class SeqTokenTensorizer(Tensorizer):
    """
    Tensorize a sequence of sentences. The input is a list of strings,
    like this one:
    ::

        ["where do you wanna meet?", "MPK"]

    if we assume this vocab
    ::

        vocab  {
          UNK: 0, PAD: 1,
          'where': 2, 'do': 3, 'you': 4, 'wanna': 5, 'meet?': 6, 'mpk': 7
        }

    this example will result in those tensors:
    ::

        idx = [[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]]
        seq_len = [2]

    If you're using BOS, EOS, BOL and EOL, the vocab will look like this
    ::

        vocab  {
          UNK: 0, PAD: 1,  BOS: 2, EOS: 3, BOL: 4, EOL: 5
          'where': 6, 'do': 7, 'you': 8, 'wanna': 9, 'meet?': 10, 'mpk': 11
        }

    this example will result in those tensors:
    ::

        idx = [
            [2,  4, 3, 1, 1,  1, 1],
            [2,  6, 7, 8, 9, 10, 3],
            [2, 11, 3, 1, 1,  1, 1],
            [2,  5, 3, 1, 1,  1, 1]
        ]
        seq_len = [4]

    """

[docs]    class Config(Tensorizer.Config):
        column: str = "text_seq"
        max_seq_len: Optional[int] = None
        #: sentence markers
        add_bos_token: bool = False
        add_eos_token: bool = False
        use_eos_token_for_bos: bool = False
        #: list markers
        add_bol_token: bool = False
        add_eol_token: bool = False
        use_eol_token_for_bol: bool = False
        #: The tokenizer to use to split input text into tokens.
        tokenizer: Tokenizer.Config = Tokenizer.Config()

[docs]    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(
            column=config.column,
            tokenizer=tokenizer,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            add_bol_token=config.add_bol_token,
            add_eol_token=config.add_eol_token,
            use_eol_token_for_bol=config.use_eol_token_for_bol,
            max_seq_len=config.max_seq_len,
            is_input=config.is_input,
        )

    def __init__(
        self,
        column: str = Config.column,
        tokenizer=None,
        add_bos_token: bool = Config.add_bos_token,
        add_eos_token: bool = Config.add_eos_token,
        use_eos_token_for_bos: bool = Config.use_eos_token_for_bos,
        add_bol_token: bool = Config.add_bol_token,
        add_eol_token: bool = Config.add_eol_token,
        use_eol_token_for_bol: bool = Config.use_eol_token_for_bol,
        max_seq_len=Config.max_seq_len,
        vocab=None,
        is_input: bool = Config.is_input,
    ):
        self.column = column
        self.tokenizer = tokenizer or Tokenizer()
        self.vocab = vocab
        self.vocab_builder = None
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.add_bol_token = add_bol_token
        self.add_eol_token = add_eol_token
        self.use_eol_token_for_bol = use_eol_token_for_bol
        self.max_seq_len = max_seq_len or 2 ** 30  # large number
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, List[str])]

[docs]    def initialize(self, vocab_builder=None, from_scratch=True):
        """Build vocabulary based on training corpus."""
        if self.vocab and from_scratch:
            return
        if not self.vocab_builder:
            self.vocab_builder = vocab_builder or VocabBuilder()
            self.vocab_builder.use_bos = self.add_bos_token
            self.vocab_builder.use_eos = self.add_eos_token
            self.vocab_builder.use_bol = self.add_bol_token
            self.vocab_builder.use_eol = self.add_eol_token

        try:
            while True:
                row = yield
                for raw_text in row[self.column]:
                    tokenized = self.tokenizer.tokenize(raw_text)
                    self.vocab_builder.add_all([t.value for t in tokenized])
        except GeneratorExit:
            self.vocab = self.vocab_builder.make_vocab()

    _lookup_tokens = TokenTensorizer._lookup_tokens
    _tokenize = TokenTensorizer._tokenize

[docs]    def numberize(self, row):
        """Tokenize, look up in vocabulary."""
        return self._process(row, raw_token_output=False)

[docs]    def prepare_input(self, row):
        """Tokenize, return tokenized_texts in raw text"""
        seq, seq_lens = self._process(row, raw_token_output=True)
        # convert all special tokens to str
        return [[str(token) for token in sen] for sen in seq], seq_lens

    def _process(self, row, raw_token_output):
        sentence_process_fn = (
            self._tokenize if raw_token_output else self._lookup_tokens
        )
        pad_token = (
            self.vocab.pad_token if raw_token_output else self.vocab.get_pad_index()
        )
        seq = []

        if self.add_bol_token:
            bol = EOL if self.use_eol_token_for_bol else BOL
            tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(bol, -1, -1)])
            seq.append(list(tokens))

        for raw_text in row[self.column]:
            tokens, _, _ = sentence_process_fn(raw_text)
            seq.append(list(tokens))

        if self.add_eol_token:
            tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(EOL, -1, -1)])
            seq.append(list(tokens))

        max_len = max(len(sentence) for sentence in seq)
        for sentence in seq:
            pad_len = max_len - len(sentence)
            if pad_len:
                sentence += [pad_token] * pad_len
        return seq, len(seq)

[docs]    def tensorize(self, batch):
        tokens, seq_lens = zip(*batch)
        return (
            pad_and_tensorize(tokens, self.vocab.get_pad_index()),
            pad_and_tensorize(seq_lens),
        )

[docs]    def sort_key(self, row):
        # use seq_len as sort key
        return row[1]


[docs]class AnnotationNumberizer(Tensorizer):
    """
    Not really a Tensorizer (since it does not create tensors) but technically
    serves the same function. This class parses Annotations in the format below
    and extracts the actions (type List[List[int]])
    ::

        [IN:GET_ESTIMATED_DURATION How long will it take to [SL:METHOD_TRAVEL
        drive ] from [SL:SOURCE Chicago ] to [SL:DESTINATION Mississippi ] ]

    Extraction algorithm is handled by Annotation class. We only care about
    the list of actions, which before vocab index lookups would look like:
    ::

        [
            IN:GET_ESTIMATED_DURATION, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
            SL:METHOD_TRAVEL, SHIFT, REDUCE,
            SHIFT,
            SL:SOURCE, SHIFT, REDUCE,
            SHIFT,
            SL:DESTINATION, SHIFT, REDUCE,
        ]

    """

[docs]    class Config(Tensorizer.Config):
        column: str = "seqlogical"

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(column=config.column, is_input=config.is_input)

    def __init__(
        self, column: str = Config.column, vocab=None, is_input: bool = Config.is_input
    ):
        self.column = column
        self.vocab = vocab
        self.vocab_builder = None
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, str)]

[docs]    def initialize(self, vocab_builder=None, from_scratch=True):
        """Build vocabulary based on training corpus."""
        if self.vocab and from_scratch:
            return
        if not self.vocab_builder:
            self.vocab_builder = vocab_builder or VocabBuilder()
            self.vocab_builder.use_unk = False
            self.vocab_builder.use_pad = False

        try:
            while True:
                row = yield
                annotation = Annotation(row[self.column])
                actions = annotation.tree.to_actions()
                self.vocab_builder.add_all(actions)
        except GeneratorExit:
            self.vocab = self.vocab_builder.make_vocab()
            self.shift_idx = self.vocab.idx[SHIFT]
            self.reduce_idx = self.vocab.idx[REDUCE]

            def filterVocab(fn):
                return [token for nt, token in self.vocab.idx.items() if fn(nt)]

            self.ignore_subNTs_roots = filterVocab(is_unsupported)
            self.valid_NT_idxs = filterVocab(is_valid_nonterminal)
            self.valid_IN_idxs = filterVocab(is_intent_nonterminal)
            self.valid_SL_idxs = filterVocab(is_slot_nonterminal)

[docs]    def numberize(self, row):
        """Tokenize, look up in vocabulary."""
        annotation = Annotation(row[self.column])
        return self.vocab.lookup_all(annotation.tree.to_actions())

[docs]    def tensorize(self, batch):
        return batch


[docs]class MetricTensorizer(Tensorizer):
    """A tensorizer which use other tensorizers' numerized data.
       Used mostly for metric reporting."""

[docs]    class Config(Tensorizer.Config):
        names: List[str]
        indexes: List[int]
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.names, config.indexes, config.is_input)

    def __init__(
        self, names: List[str], indexes: List[int], is_input: bool = Config.is_input
    ):
        self.names = names
        self.indexes = indexes
        super().__init__(is_input)

[docs]    def numberize(self, row):
        # metric tensorizer will depends on other tensorizers' numeric result
        return None

[docs]    def tensorize(self, batch):
        raise NotImplementedError


[docs]class NtokensTensorizer(MetricTensorizer):
    """A tensorizer which will reference another tensorizer's numerized data
       to calculate the num tokens.
       Used for calculating tokens per second."""

[docs]    def tensorize(self, batch):
        ntokens = 0
        for name, index in zip(self.names, self.indexes):
            ntokens += sum((sample[index] for sample in batch[name]))
        return ntokens


[docs]class FloatTensorizer(Tensorizer):
    """A tensorizer for reading in scalars from the data."""

[docs]    class Config(Tensorizer.Config):
        #: The name of the column to parse from the data source.
        column: str

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.column, config.is_input)

    def __init__(self, column: str, is_input: bool = Config.is_input):
        self.column = column
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, float)]

[docs]    def numberize(self, row):
        return row[self.column]

[docs]    def tensorize(self, batch):
        return cuda.tensor(batch, torch.float)


[docs]def initialize_tensorizers(tensorizers, data_source, from_scratch=True):
    """A utility function to stream a data source to the initialize functions
    of a dict of tensorizers."""
    initializers = []
    for init in [
        tensorizer.initialize(from_scratch=from_scratch)
        if hasattr(tensorizer, "vocab")
        else tensorizer.initialize()
        for tensorizer in tensorizers.values()
    ]:
        try:
            init.send(None)  # kick
            initializers.append(init)
        except StopIteration:
            pass

    if initializers:
        for row in data_source:
            for init in initializers:
                init.send(row)