Source code for pytext.data.tensorizers

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import contextlib
import copy
import json
import sys
from itertools import chain
from typing import List, Optional, Tuple

import torch
from pytext.common import Padding, constants
from pytext.config.component import Component, ComponentType, create_component
from pytext.data.data_structures.annotation import (
    REDUCE,
    SHIFT,
    Annotation,
    is_intent_nonterminal,
    is_slot_nonterminal,
    is_unsupported,
    is_valid_nonterminal,
)
from pytext.data.sources.data_source import Gazetteer
from pytext.data.tokenizers import Token, Tokenizer
from pytext.torchscript.tensorizer import (
    ScriptFloat1DListTensorizer,
    ScriptFloatListSeqTensorizer,
    ScriptInteger1DListTensorizer,
    VectorNormalizer,
)
from pytext.torchscript.tokenizer import ScriptDoNothingTokenizer
from pytext.torchscript.utils import ScriptBatchInput, pad_3d, validate_padding_control
from pytext.torchscript.vocab import ScriptVocabulary
from pytext.utils import cuda, precision
from pytext.utils.data import Slot
from pytext.utils.file_io import PathManager
from pytext.utils.lazy import lazy_property
from pytext.utils.precision import maybe_half
from pytext.utils.usage import log_class_usage

from .utils import (
    BOL,
    BOS,
    BYTE_BOS,
    BYTE_EOS,
    EOL,
    EOS,
    PAD,
    VocabBuilder,
    Vocabulary,
    align_target_label,
    pad_and_tensorize,
)


@contextlib.contextmanager
def to_device(tensorizer_script_impl, device):
    cur_device = tensorizer_script_impl.device
    tensorizer_script_impl.device = device
    yield
    tensorizer_script_impl.device = cur_device


def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2 ** 30,
):
    tokenized = (
        pre_tokenized
        or tokenizer.tokenize(text)[
            : max_seq_len - (bos_token is not None) - (eos_token is not None)
        ]
    )
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        tokenized = [Token(pad_token, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx


def lookup_tokens(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    vocab: Vocabulary = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2 ** 30,
):
    tokenized_texts, start_idx, end_idx = tokenize(
        text,
        pre_tokenized,
        tokenizer,
        bos_token,
        eos_token,
        pad_token,
        use_eos_token_for_bos,
        max_seq_len,
    )
    tokens = vocab.lookup_all(tokenized_texts)
    return tokens, start_idx, end_idx
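# Usage sketch (illustrative, not part of the original module): how tokenize()
# behaves with a BOS/EOS pair. The assumption that the default Tokenizer()
# lowercases and splits on whitespace is mine; adjust for your tokenizer config.
#
#   >>> texts, starts, ends = tokenize(
#   ...     text="Order coffee", tokenizer=Tokenizer(), bos_token=BOS, eos_token=EOS
#   ... )
#   # texts  -> (BOS, "order", "coffee", EOS); special tokens get dummy spans
#   # starts -> (-1, 0, 6, -1), ends -> (-1, 5, 12, -1)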
class TensorizerScriptImpl(torch.nn.Module):
    device: str
    seq_padding_control: Optional[List[int]]
    batch_padding_control: Optional[List[int]]

    def __init__(self):
        super().__init__()
        self.device: str = ""
        # padding_control options:
        # None - no padding
        # [0, pad1, pad2, pad3, ...] - pads sequence/batch length to the smallest
        # padX larger than the input
        self.seq_padding_control = None
        self.batch_padding_control = None

    @torch.jit.export
    def set_device(self, device: str):
        self.device = device

    @torch.jit.export
    def set_padding_control(self, dimension: str, padding_control: Optional[List[int]]):
        """
        This function will be called to set a padding style.
        None - no padding
        List - first element 0; rounds the sequence/batch length up to the
        smallest list element larger than the input.
        """
        if not validate_padding_control(padding_control):
            raise RuntimeError("Malformed padding_control value")
        if dimension == "sequence_length":
            self.seq_padding_control = padding_control
        elif dimension == "batch_length":
            self.batch_padding_control = padding_control
        else:
            raise RuntimeError("Illegal padding dimension specified.")

    def batch_size(self, inputs: ScriptBatchInput) -> int:
        texts: Optional[List[List[str]]] = inputs.texts
        tokens: Optional[List[List[List[str]]]] = inputs.tokens
        if texts is not None:
            return len(texts)
        elif tokens is not None:
            return len(tokens)
        else:
            raise RuntimeError("Empty input for both texts and tokens.")

    def row_size(self, inputs: ScriptBatchInput) -> int:
        texts: Optional[List[List[str]]] = inputs.texts
        tokens: Optional[List[List[List[str]]]] = inputs.tokens
        if texts is not None:
            return len(texts[0])
        elif tokens is not None:
            return len(tokens[0])
        else:
            raise RuntimeError("Empty input for both texts and tokens.")

    def get_texts_by_index(
        self, texts: Optional[List[List[str]]], index: int
    ) -> Optional[List[str]]:
        if texts is None or len(texts) == 0:
            return None
        return texts[index]

    def get_tokens_by_index(
        self, tokens: Optional[List[List[List[str]]]], index: int
    ) -> Optional[List[List[str]]]:
        if tokens is None or len(tokens) == 0:
            return None
        return tokens[index]

    def tokenize(self, *args, **kwargs):
        """
        This function will receive the inputs from clients. There are usually
        two possible kinds of input:
        1) a row of texts: List[str]
        2) a row of pre-processed tokens: List[List[str]]

        Override this function to be TorchScriptable, e.g. you need to declare
        concrete input arguments with type hints.
        """
        raise NotImplementedError

    def numberize(self, *args, **kwargs):
        """
        This function will receive the outputs of tokenize(), or it will be
        called directly from the PyText Tensorizer's numberize().

        Override this function to be TorchScriptable, e.g. you need to declare
        concrete input arguments with type hints.
        """
        raise NotImplementedError

    def tensorize(self, *args, **kwargs):
        """
        This function will receive a list (e.g. a batch) of outputs from
        numberize(), pad them and convert them to output tensors.

        Override this function to be TorchScriptable, e.g. you need to declare
        concrete input arguments with type hints.
        """
        raise NotImplementedError

    @torch.jit.ignore
    def tensorize_wrapper(self, *args, **kwargs):
        """
        This function will receive a list (e.g. a batch) of outputs from
        numberize(), pad them and convert them to output tensors.

        It is called by the PyText Tensorizer during training; it is not
        TorchScriptable because it depends on cuda.device().
        """
        with to_device(self, cuda.device()):
            return self.tensorize(*args, **kwargs)

    @torch.jit.ignore
    def torchscriptify(self):
        return torch.jit.script(self)
class Tensorizer(Component):
    """Tensorizers are components that convert batches of
    `pytext.data.type.DataType` instances into tensors. These tensors will
    eventually be inputs to the model, but the model is aware of its
    tensorizers and can arrange the tensors they create to fit its inputs.

    Tensorizers have an initialize function. This function allows the
    tensorizer to read through the training dataset to build up any data that
    it needs for creating the model. Commonly this is valuable for things like
    inferring a vocabulary from the training set, or learning the entire set
    of training labels, slot labels, etc.
    """

    __COMPONENT_TYPE__ = ComponentType.TENSORIZER
    __EXPANSIBLE__ = True
    __TENSORIZER_SCRIPT_IMPL__ = None

    class Config(Component.Config):
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = True

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.is_input)

    def __init__(self, is_input: bool = True):
        self.is_input = is_input
        log_class_usage(__class__)

    @property
    def column_schema(self):
        """Generic types don't pickle well pre-3.7, so we don't actually want
        to store the schema as an attribute. We're already storing all of the
        columns anyway, so until there's a better solution, schema is a
        property."""
        return []

    def numberize(self, row):
        raise NotImplementedError

    def prepare_input(self, row):
        """Return preprocessed input tensors/blob for caffe2 prediction net."""
        return self.numberize(row)

    def sort_key(self, row):
        raise NotImplementedError

    def tensorize(self, batch):
        """Tensorizer knows how to pad and tensorize a batch of its own output."""
        return batch

    def initialize(self, from_scratch=True):
        """
        The initialize function is carefully designed to allow us to read
        through the training dataset only once, and not store it in memory. As
        such, it can't itself manually iterate over the data source. Instead,
        the initialize function is a coroutine, which is sent row data. This
        should look roughly like::

            # set up variables here
            ...
            try:
                # start reading through data source
                while True:
                    # row has type Dict[str, types.DataType]
                    row = yield
                    # update any variables, vocabularies, etc.
                    ...
            except GeneratorExit:
                # finalize your initialization, set instance variables, etc.
                ...

        See `WordTokenizer.initialize` for a more concrete example.
        """
        return
        # we need yield here to make this function a generator
        yield

    @lazy_property
    def tensorizer_script_impl(self):
        # Script tensorizer is unpickleable; we use lazy_property for lazy
        # initialization, so the object is constructed at run time.
        raise NotImplementedError

    def __getstate__(self):
        # make a shallow copy of state to avoid side effects on the original object
        state = copy.copy(vars(self))
        state.pop("tensorizer_script_impl", None)
        return state

    def stringify(self, token_indices):
        # Used in the metric reporter to convert from tokens back to a string
        res = ""
        if hasattr(self, "vocab"):
            res = " ".join([self.vocab._vocab[index] for index in token_indices])
            if hasattr(self, "tokenizer"):
                if hasattr(self.tokenizer, "decode"):
                    res = self.tokenizer.decode(res)
        return res

    def torchscriptify(self):
        return self.tensorizer_script_impl.torchscriptify()
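# Sketch (not part of the original module): how a caller drives the
# initialize() coroutine described above. PyText's data layer does this
# internally; the loop below is a simplified stand-in, using the
# TokenTensorizer defined later in this module and an assumed "text" column.
#
#   >>> tensorizers = {"tokens": TokenTensorizer(text_column="text")}
#   >>> inits = {name: t.initialize() for name, t in tensorizers.items()}
#   >>> for init in inits.values():
#   ...     init.send(None)               # prime each coroutine
#   >>> for row in training_rows:         # row: Dict[str, types.DataType]
#   ...     for init in inits.values():
#   ...         init.send(row)
#   >>> for init in inits.values():
#   ...     init.close()                  # GeneratorExit -> finalize vocab, etc.
#
# A real driver also catches StopIteration for tensorizers whose initialize()
# returns early (e.g. when a vocab was already provided).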
[docs]class VocabFileConfig(Component.Config): #: File containing tokens to add to vocab (first whitespace-separated entry per #: line) filepath: str = "" #: Whether to skip the first line of the file (e.g. if it is a header line) skip_header_line: bool = False #: Whether to lowercase each of the tokens in the file lowercase_tokens: bool = False #: The max number of tokens to add to vocab size_limit: int = 0
class VocabConfig(Component.Config):
    #: Whether to add tokens from training data to vocab.
    build_from_data: bool = True
    #: Add the `size_from_data` most frequent tokens in training data to vocab
    #: (if this is 0, add all tokens from training data).
    size_from_data: int = 0
    #: Filter out tokens from training data whose count is smaller than
    #: `min_counts`.
    min_counts: int = 0
    vocab_files: List[VocabFileConfig] = []
[docs]class TokenTensorizer(Tensorizer): """Convert text to a list of tokens. Do this based on a tokenizer configuration, and build a vocabulary for numberization. Finally, pad the batch to create a square tensor of the correct size. """
[docs] class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False max_seq_len: Optional[int] = None vocab: VocabConfig = VocabConfig() vocab_file_delimiter: str = " "
[docs] @classmethod def from_config(cls, config: Config): tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer) return cls( text_column=config.column, tokenizer=tokenizer, add_bos_token=config.add_bos_token, add_eos_token=config.add_eos_token, use_eos_token_for_bos=config.use_eos_token_for_bos, max_seq_len=config.max_seq_len, vocab_config=config.vocab, vocab_file_delimiter=config.vocab_file_delimiter, is_input=config.is_input, )
def __init__( self, text_column, tokenizer=None, add_bos_token=Config.add_bos_token, add_eos_token=Config.add_eos_token, use_eos_token_for_bos=Config.use_eos_token_for_bos, max_seq_len=Config.max_seq_len, vocab_config=None, vocab=None, vocab_file_delimiter=" ", is_input=Config.is_input, ): self.text_column = text_column self.tokenizer = tokenizer or Tokenizer() self.vocab = vocab self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_eos_token_for_bos = use_eos_token_for_bos self.max_seq_len = max_seq_len or 2 ** 30 # large number self.vocab_builder = None self.vocab_config = vocab_config or VocabConfig() self.vocab_file_delimiter = vocab_file_delimiter super().__init__(is_input) @property def column_schema(self): return [(self.text_column, str)] def _tokenize(self, text=None, pre_tokenized=None): return tokenize( text=text, pre_tokenized=pre_tokenized, tokenizer=self.tokenizer, bos_token=self.vocab.bos_token if self.add_bos_token else None, eos_token=self.vocab.eos_token if self.add_eos_token else None, pad_token=self.vocab.pad_token, use_eos_token_for_bos=self.use_eos_token_for_bos, max_seq_len=self.max_seq_len, ) def _lookup_tokens(self, text=None, pre_tokenized=None): return lookup_tokens( text=text, pre_tokenized=pre_tokenized, tokenizer=self.tokenizer, vocab=self.vocab, bos_token=self.vocab.bos_token if self.add_bos_token else None, eos_token=self.vocab.eos_token if self.add_eos_token else None, pad_token=self.vocab.pad_token, use_eos_token_for_bos=self.use_eos_token_for_bos, max_seq_len=self.max_seq_len, ) def _reverse_lookup(self, token_ids): return [self.vocab[id] for id in token_ids]
[docs] def initialize(self, vocab_builder=None, from_scratch=True): """Build vocabulary based on training corpus.""" if self.vocab and from_scratch: if self.vocab_config.build_from_data or self.vocab_config.vocab_files: print( f"`{self.text_column}` column: vocab already provided, skipping " f"adding tokens from data and from vocab files." ) return if not self.vocab_config.build_from_data and not self.vocab_config.vocab_files: raise ValueError( f"To create token tensorizer for '{self.text_column}', either " f"`build_from_data` or `vocab_files` must be set." ) if not self.vocab_builder: # else means not initialize from scratch, self.vocab_builder # would be set already self.vocab_builder = vocab_builder or VocabBuilder( delimiter=self.vocab_file_delimiter ) self.vocab_builder.use_bos = self.add_bos_token self.vocab_builder.use_eos = self.add_eos_token if not self.vocab_config.build_from_data: self._add_vocab_from_files() self.vocab = self.vocab_builder.make_vocab() return try: while True: row = yield raw_text = row[self.text_column] tokenized = self.tokenizer.tokenize(raw_text) self.vocab_builder.add_all([t.value for t in tokenized]) except GeneratorExit: self.vocab_builder.truncate_to_vocab_size( self.vocab_config.size_from_data, self.vocab_config.min_counts ) self._add_vocab_from_files() self.vocab = self.vocab_builder.make_vocab()
def _add_vocab_from_files(self): for vocab_file in self.vocab_config.vocab_files: with PathManager.open(vocab_file.filepath) as f: self.vocab_builder.add_from_file( f, vocab_file.skip_header_line, vocab_file.lowercase_tokens, vocab_file.size_limit, )
[docs] def numberize(self, row): """Tokenize, look up in vocabulary.""" tokens, start_idx, end_idx = self._lookup_tokens(row[self.text_column]) token_ranges = list(zip(start_idx, end_idx)) return tokens, len(tokens), token_ranges
[docs] def prepare_input(self, row): """Tokenize, look up in vocabulary, return tokenized_texts in raw text""" tokenized_texts, start_idx, end_idx = self._tokenize(row[self.text_column]) token_ranges = list(zip(start_idx, end_idx)) return list(tokenized_texts), len(tokenized_texts), token_ranges
[docs] def tensorize(self, batch): tokens, seq_lens, token_ranges = zip(*batch) return ( pad_and_tensorize(tokens, self.vocab.get_pad_index()), pad_and_tensorize(seq_lens), pad_and_tensorize(token_ranges), )
[docs] def sort_key(self, row): # use seq_len as sort key return row[1]
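# Usage sketch (illustrative, not part of the original module): build a vocab
# from two rows, then numberize and tensorize. The column name "text" and the
# whitespace-splitting default Tokenizer() are assumptions of this sketch.
#
#   >>> t = TokenTensorizer(text_column="text")
#   >>> init = t.initialize()
#   >>> init.send(None)                                  # prime the coroutine
#   >>> for row in [{"text": "order coffee"}, {"text": "play music"}]:
#   ...     init.send(row)
#   >>> init.close()                                     # builds t.vocab
#   >>> tokens, seq_len, token_ranges = t.numberize({"text": "order coffee"})
#   >>> seq_len
#   2
#   >>> padded, seq_lens, ranges = t.tensorize([(tokens, seq_len, token_ranges)])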
class ByteTensorizer(Tensorizer):
    """Turn characters into a sequence of int8 bytes. One character will have
    one or more bytes, depending on its encoding.
    """

    UNK_BYTE = 0
    PAD_BYTE = 0
    NUM = 256
[docs] class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" lower: bool = True max_seq_len: Optional[int] = None add_bos_token: Optional[bool] = False add_eos_token: Optional[bool] = False use_eos_token_for_bos: Optional[bool] = False
[docs] @classmethod def from_config(cls, config: Config): return cls( config.column, config.lower, config.max_seq_len, config.add_bos_token, config.add_eos_token, config.use_eos_token_for_bos, config.is_input, )
def __init__( self, text_column, lower=True, max_seq_len=None, add_bos_token=Config.add_bos_token, add_eos_token=Config.add_eos_token, use_eos_token_for_bos=Config.use_eos_token_for_bos, is_input=Config.is_input, ): self.text_column = text_column self.lower = lower self.max_seq_len = max_seq_len self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_eos_token_for_bos = use_eos_token_for_bos super().__init__(is_input) @property def column_schema(self): return [(self.text_column, str)]
[docs] def numberize(self, row): """Convert text to characters.""" text = row[self.text_column].strip() if self.lower: text = text.lower() bytes = list(text.encode()) if self.max_seq_len: bytes = bytes[: self.max_seq_len] if self.add_bos_token: bos = BYTE_EOS if self.use_eos_token_for_bos else BYTE_BOS if bos in text: print('Special token "{}" exists in text "{}". Exit.'.format(bos, text)) sys.exit(1) bytes = list(bos.encode()) + bytes if self.add_eos_token: if BYTE_EOS in text: print( 'Special token "{}" exists in text "{}". Exit.'.format( BYTE_EOS, text ) ) sys.exit(1) bytes = bytes + list(BYTE_EOS.encode()) return bytes, len(bytes)
[docs] def tensorize(self, batch): bytes, bytes_len = zip(*batch) return pad_and_tensorize(bytes, self.PAD_BYTE), pad_and_tensorize(bytes_len)
[docs] def sort_key(self, row): # use bytes_len as sort key return row[1]
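# Sketch (not part of the original module): numberize() works on UTF-8 bytes,
# so a non-ASCII character contributes more than one byte. The "text" column
# name is the Config default.
#
#   >>> bt = ByteTensorizer(text_column="text")
#   >>> byte_values, length = bt.numberize({"text": "Café"})
#   >>> byte_values            # lowercased to "café" -> 5 UTF-8 bytes
#   [99, 97, 102, 195, 169]
#   >>> length
#   5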
[docs]class ByteTokenTensorizer(Tensorizer): """Turn words into 2-dimensional tensors of int8 bytes. Words are padded to `max_byte_len`. Also computes sequence lengths (1-D tensor) and token lengths (2-D tensor). 0 is the pad byte. """ NUM_BYTES = 256
[docs] class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() #: The max token length for input text. max_seq_len: Optional[int] = None #: The max byte length for a token. max_byte_len: int = 15 #: Offset to add to all non-padding bytes offset_for_non_padding: int = 0 add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False
[docs] @classmethod def from_config(cls, config: Config): tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer) return cls( text_column=config.column, tokenizer=tokenizer, max_seq_len=config.max_seq_len, max_byte_len=config.max_byte_len, offset_for_non_padding=config.offset_for_non_padding, add_bos_token=config.add_bos_token, add_eos_token=config.add_eos_token, use_eos_token_for_bos=config.use_eos_token_for_bos, is_input=config.is_input, )
def __init__( self, text_column, tokenizer=None, max_seq_len=Config.max_seq_len, max_byte_len=Config.max_byte_len, offset_for_non_padding=Config.offset_for_non_padding, add_bos_token=Config.add_bos_token, add_eos_token=Config.add_eos_token, use_eos_token_for_bos=Config.use_eos_token_for_bos, is_input=Config.is_input, ): self.text_column = text_column self.tokenizer = tokenizer or Tokenizer() self.max_seq_len = max_seq_len or 2 ** 30 # large number self.max_byte_len = max_byte_len self.offset_for_non_padding = offset_for_non_padding self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_eos_token_for_bos = use_eos_token_for_bos super().__init__(is_input) @property def column_schema(self): return [(self.text_column, str)]
[docs] def numberize(self, row): """Convert text to bytes, pad batch.""" tokens = self.tokenizer.tokenize(row[self.text_column])[ : (self.max_seq_len - self.add_bos_token - self.add_eos_token) ] if self.add_bos_token: bos = EOS if self.use_eos_token_for_bos else BOS tokens = [Token(bos, -1, -1)] + tokens if self.add_eos_token: tokens.append(Token(EOS, -1, -1)) if not tokens: tokens = [Token(PAD, -1, -1)] bytes = [self._numberize_token(token)[: self.max_byte_len] for token in tokens] token_lengths = len(tokens) byte_lengths = [len(token_bytes) for token_bytes in bytes] return bytes, token_lengths, byte_lengths
def _numberize_token(self, token): return [c + self.offset_for_non_padding for c in token.value.encode()]
[docs] def tensorize(self, batch, pad_token=0): bytes, token_lengths, byte_lengths = zip(*batch) # Set bytes shape because byte length should always be `max_byte_len` no # matter how long the bytes in the batch are. pad_shape = ( len(batch), precision.pad_length(max(len(length) for length in byte_lengths)), self.max_byte_len, ) return ( pad_and_tensorize(bytes, pad_shape=pad_shape, pad_token=pad_token), pad_and_tensorize(token_lengths), pad_and_tensorize(byte_lengths), )
[docs] def sort_key(self, row): return len(row[0])
[docs]class Float1DListTensorizer(Tensorizer): """ Tensorizes the 1d list of floats -- List[float] TODO: Even though very similar, 'FloatListTensorizer' currently does not support this vanilla case for tensorization of List[float]. In future, if 'FloatListTensorizer' accommodates this case, we do not need this separate tensorizer. """ __TENSORIZER_SCRIPT_IMPL__ = ScriptFloat1DListTensorizer
[docs] class Config(Tensorizer.Config): # inputs column: str = "float_list_column" pad_token: float = 1.0
[docs] @classmethod def from_config(cls, config: Config, **kwargs): return cls(config, **kwargs)
def __init__(self, config: Config, **kwargs): # mention link probability self.column = config.column self.pad_token = config.pad_token @property def column_schema(self): return [(self.column, List[float])]
[docs] def initialize(self, from_scratch=True): # start reading through data source while True: yield
[docs] def numberize(self, row): assert self.column in row, """1d float-list column not present in the data""" return row[self.column]
[docs] def tensorize(self, batch): values = pad_and_tensorize(batch, pad_token=self.pad_token, dtype=torch.float) return values
@lazy_property def tensorizer_script_impl(self): return ScriptFloat1DListTensorizer()
[docs]class Integer1DListTensorizer(Tensorizer): """ Tensorizes the 1d list of integers -- List[int] """ __TENSORIZER_SCRIPT_IMPL__ = ScriptInteger1DListTensorizer SPAN_PAD_IDX = 0
[docs] class Config(Tensorizer.Config): # inputs column: str = "int_list_column"
[docs] @classmethod def from_config(cls, config: Config, **kwargs): return cls(config, **kwargs)
def __init__(self, config: Config, **kwargs): self.column = config.column @property def column_schema(self): return [(self.column, List[int])]
[docs] def initialize(self, from_scratch=True): # start reading through data source while True: yield
[docs] def numberize(self, row): assert self.column in row, """Integer 1d list column not present in the data""" return row[self.column]
[docs] def tensorize(self, batch): values = pad_and_tensorize(batch, pad_token=self.SPAN_PAD_IDX) return values
@lazy_property def tensorizer_script_impl(self): return ScriptInteger1DListTensorizer()
[docs]class CharacterVocabTokenTensorizerScriptImpl(TensorizerScriptImpl): def __init__( self, add_bos_token: bool, add_eos_token: bool, use_eos_token_for_bos: bool, max_seq_len: int, vocab: Vocabulary, tokenizer: Optional[Tokenizer], ): super().__init__() if tokenizer is not None and hasattr(tokenizer, "torchscriptify"): try: self.tokenizer = tokenizer.torchscriptify() except NotImplementedError: # This is fine as long as the exported tokenizer is only used # in pre-tokenized mode self.tokenizer = None else: self.tokenizer = None self.do_nothing_tokenizer = ScriptDoNothingTokenizer() self.vocab = ScriptVocabulary( list(vocab), pad_idx=vocab.get_pad_index(), bos_idx=vocab.get_bos_index() if add_bos_token else -1, eos_idx=vocab.get_eos_index() if add_eos_token else -1, ) self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_eos_token_for_bos = use_eos_token_for_bos self.max_seq_len = max_seq_len
[docs] def tokenize( self, row_text: Optional[str] = None, row_pre_tokenized: Optional[List[str]] = None, ) -> Tuple[List[List[str]], List[int]]: tokens: List[Tuple[str, int, int]] = [] char_tokens: List[List[str]] = [] char_tokens_lengths: List[int] = [] if row_text is not None: assert self.tokenizer is not None tokens = self.tokenizer.tokenize(row_text) elif row_pre_tokenized is not None: for token in row_pre_tokenized: tokens.extend(self.do_nothing_tokenizer.tokenize(token)) for token in tokens: chars: List[str] = [] for char in token[0]: chars.append(char) char_tokens.append(chars) char_tokens_lengths.append(len(chars)) return char_tokens, char_tokens_lengths
[docs] def numberize( self, char_tokens: List[List[str]], char_tokens_lengths: List[int] ) -> Tuple[List[List[int]], List[int]]: tokens: List[List[int]] = [] tokens = self.vocab.lookup_indices_2d(char_tokens) return tokens, char_tokens_lengths
[docs] def tensorize( self, tokens: List[List[List[int]]], tokens_lengths: List[List[int]], ) -> Tuple[torch.Tensor, torch.Tensor]: tokens_padded: List[List[List[int]]] = [] tokens_lengths_padded: List[List[int]] = [] tokens_padded, tokens_lengths_padded = pad_3d( tokens, tokens_lengths, self.vocab.get_pad_index() ) tokens_tensor: torch.Tensor = torch.tensor(tokens_padded, dtype=torch.long) tokens_lengths_tensor: torch.Tensor = torch.tensor( tokens_lengths_padded, dtype=torch.long ) return (tokens_tensor, tokens_lengths_tensor)
[docs] def get_texts_by_index( self, texts: Optional[List[List[str]]], index: int ) -> Optional[str]: if texts is None or len(texts) == 0: return None # CharacterVocabTokenTensorizer only works with a single text per row, stick with that return texts[index][0]
[docs] def get_tokens_by_index( self, tokens: Optional[List[List[List[str]]]], index: int ) -> Optional[List[str]]: if tokens is None or len(tokens) == 0: return None # CharacterVocabTokenTensorizer only works with a single text per row, stick with that return tokens[index][0]
[docs] def forward(self, inputs: ScriptBatchInput) -> Tuple[torch.Tensor, torch.Tensor]: tokens_3d: List[List[List[int]]] = [] seq_lens_2d: List[List[int]] = [] for idx in range(self.batch_size(inputs)): char_tokens: List[List[int]] = [] char_tokens_lengths: List[int] = [] char_tokens, char_tokens_lengths = self.tokenize( self.get_texts_by_index(inputs.texts, idx), self.get_tokens_by_index(inputs.tokens, idx), ) numberized: Tuple[List[List[int]], List[int]] = self.numberize( char_tokens, char_tokens_lengths ) tokens_3d.append(numberized[0]) seq_lens_2d.append(numberized[1]) return self.tensorize(tokens_3d, seq_lens_2d)
class CharacterVocabTokenTensorizer(Tensorizer):
    """Turn words into 2-dimensional tensors of ints based on the char vocab.
    Words are padded to the maximum word length (also capped at
    `max_char_length`). Sequence lengths are the length of each token.

    The difference from pytext.data.tensorizers.CharacterTokenTensorizer is
    that CharacterTokenTensorizer uses the ASCII values of the characters and
    does not require building a vocab; here we tensorize based on the vocab.
    """

    __TENSORIZER_SCRIPT_IMPL__ = CharacterVocabTokenTensorizerScriptImpl
[docs] class Config(Tensorizer.Config): #: The name of the text column to parse from the data source. column: str = "text" #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False max_seq_len: Optional[int] = None vocab: VocabConfig = VocabConfig() vocab_file_delimiter: str = " "
[docs] @classmethod def from_config(cls, config: Config): tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer) return cls( text_column=config.column, tokenizer=tokenizer, add_bos_token=config.add_bos_token, add_eos_token=config.add_eos_token, use_eos_token_for_bos=config.use_eos_token_for_bos, max_seq_len=config.max_seq_len, vocab_config=config.vocab, vocab_file_delimiter=config.vocab_file_delimiter, is_input=config.is_input, )
def __init__( self, text_column, tokenizer=None, add_bos_token=Config.add_bos_token, add_eos_token=Config.add_eos_token, use_eos_token_for_bos=Config.use_eos_token_for_bos, max_seq_len=Config.max_seq_len, vocab_config=None, vocab=None, vocab_file_delimiter=" ", is_input=Config.is_input, ): self.text_column = text_column self.tokenizer = tokenizer or Tokenizer() self.vocab = vocab self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.use_eos_token_for_bos = use_eos_token_for_bos self.max_seq_len = max_seq_len or 2 ** 30 # large number self.vocab_builder = None self.vocab_config = vocab_config or VocabConfig() self.vocab_file_delimiter = vocab_file_delimiter super().__init__(is_input) @property def column_schema(self): return [(self.text_column, str)]
[docs] def initialize(self, vocab_builder=None, from_scratch=True): """Build vocabulary based on training corpus.""" if self.vocab and from_scratch: if self.vocab_config.build_from_data or self.vocab_config.vocab_files: print( f"`{self.text_column}` column: vocab already provided, skipping " f"adding tokens from data and from vocab files." ) return if not self.vocab_config.build_from_data and not self.vocab_config.vocab_files: raise ValueError( f"To create token tensorizer for '{self.text_column}', either " f"`build_from_data` or `vocab_files` must be set." ) if not self.vocab_builder: # else means not initialize from scratch, self.vocab_builder # would be set already self.vocab_builder = vocab_builder or VocabBuilder( delimiter=self.vocab_file_delimiter ) self.vocab_builder.use_bos = self.add_bos_token self.vocab_builder.use_eos = self.add_eos_token if not self.vocab_config.build_from_data: self._add_vocab_from_files() self.vocab = self.vocab_builder.make_vocab() return try: while True: row = yield raw_text = row[self.text_column] tokenized = self.tokenizer.tokenize(raw_text) # tokenize the word tokens further char_tokenized = self.character_tokenize(tokenized) # build the vocab self.vocab_builder.add_all(char_tokenized) except GeneratorExit: self.vocab_builder.truncate_to_vocab_size( self.vocab_config.size_from_data, self.vocab_config.min_counts ) self._add_vocab_from_files() self.vocab = self.vocab_builder.make_vocab()
[docs] def character_tokenize(self, tokens: List[Token]): res = [] for token in tokens: chars = [] for char in token.value: chars.append(char) res.append(chars) return res
def _add_vocab_from_files(self): for vocab_file in self.vocab_config.vocab_files: with PathManager.open(vocab_file.filepath) as f: self.vocab_builder.add_from_file( f, vocab_file.skip_header_line, vocab_file.lowercase_tokens, vocab_file.size_limit, )
[docs] def numberize(self, row): """Tokenize, look up in vocabulary.""" raw_text = row[self.text_column] tokenized = self.tokenizer.tokenize(raw_text) tokens_in_chars = self.character_tokenize(tokenized) char_tokens = self.vocab.lookup_all(tokens_in_chars) char_tokens_lengths = [len(token) for token in tokens_in_chars] return char_tokens, char_tokens_lengths
[docs] def tensorize(self, batch): char_tokens, char_tokens_lengths = zip(*batch) return ( pad_and_tensorize(char_tokens, self.vocab.get_pad_index()), pad_and_tensorize(char_tokens_lengths), )
@lazy_property def tensorizer_script_impl(self): return self.__TENSORIZER_SCRIPT_IMPL__( add_bos_token=self.add_bos_token, add_eos_token=self.add_eos_token, use_eos_token_for_bos=self.use_eos_token_for_bos, max_seq_len=self.max_seq_len, vocab=self.vocab, tokenizer=self.tokenizer, )
class CharacterTokenTensorizer(TokenTensorizer):
    """Turn words into 2-dimensional tensors of ints based on their ASCII
    values. Words are padded to the maximum word length (also capped at
    `max_char_length`). Sequence lengths are the length of each token, 0 for
    the pad token.
    """
[docs] class Config(TokenTensorizer.Config): #: The max character length for a token. max_char_length: int = 20
def __init__(self, max_char_length: int = Config.max_char_length, **kwargs): self.max_char_length = max_char_length super().__init__(**kwargs) # Don't need to create a vocab initialize = Tensorizer.initialize
[docs] def numberize(self, row): """Convert text to characters, pad batch.""" tokens = self.tokenizer.tokenize(row[self.text_column])[: self.max_seq_len] characters = [ self._numberize_token(token)[: self.max_char_length] for token in tokens ] token_lengths = len(tokens) char_lengths = [len(token_chars) for token_chars in characters] return characters, token_lengths, char_lengths
def _numberize_token(self, token): return [ord(c) for c in token.value]
[docs] def tensorize(self, batch): characters, token_lengths, char_lengths = zip(*batch) return ( pad_and_tensorize(characters), pad_and_tensorize(token_lengths), pad_and_tensorize(char_lengths), )
[docs] def sort_key(self, row): return len(row[0])
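# Sketch (not part of the original module): characters are numberized by their
# code points via ord(), so no vocab is needed. The "text" column name is an
# assumption of this sketch.
#
#   >>> ct = CharacterTokenTensorizer(text_column="text")
#   >>> chars, n_tokens, char_lens = ct.numberize({"text": "hi bob"})
#   >>> chars
#   [[104, 105], [98, 111, 98]]
#   >>> (n_tokens, char_lens)
#   (2, [2, 3])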
[docs]class LabelTensorizer(Tensorizer): """Numberize labels. Label can be used as either input or target. NB: if the labels are used as targets for binary classification with a loss such as cosine distance, the order of the `label_vocab` *does* matter, and it should be `[negative_class, positive_class]`. """ __EXPANSIBLE__ = True
[docs] class Config(Tensorizer.Config): #: The name of the label column to parse from the data source. column: str = "label" #: Whether to allow for unknown labels at test/prediction time. allow_unknown: bool = False #: Whether vocab should have pad, usually false when label is used as target. pad_in_vocab: bool = False #: The label values, if known. Will skip initialization step if provided. label_vocab: Optional[List[str]] = None #: File with the label values. This can be used when the label space is #: too large to specify these as a list. The file should not contain #: a header. label_vocab_file: Optional[str] = None # Indicate if it can be used to generate input Tensors for prediction. is_input: bool = False #: Add these labels to the vocabulary during the initialization step (only #: if the initialization step is not skipped). Useful when the dataset may #: not include all labels, as for incremental trainings. add_labels: Optional[List[str]] = None
[docs] @classmethod def from_config(cls, config: Config): return cls( config.column, config.allow_unknown, config.pad_in_vocab, config.label_vocab, config.label_vocab_file, config.is_input, config.add_labels, )
def __init__( self, label_column: str = "label", allow_unknown: bool = False, pad_in_vocab: bool = False, label_vocab: Optional[List[str]] = None, label_vocab_file: Optional[str] = None, is_input: bool = Config.is_input, add_labels: Optional[List[str]] = None, ): self.label_column = label_column self.pad_in_vocab = pad_in_vocab self.vocab_builder = VocabBuilder() self.vocab_builder.use_pad = pad_in_vocab self.vocab_builder.use_unk = allow_unknown self.add_labels = add_labels self.vocab = None self.pad_idx = -1 assert ( label_vocab is None or label_vocab_file is None ), "Cannot specify both label_vocab and label_vocab_file" if label_vocab: self.vocab_builder.add_all(label_vocab) self.vocab, self.pad_idx = self._create_vocab() elif label_vocab_file: with PathManager.open(label_vocab_file) as f: self.vocab_builder.add_from_file( f, skip_header_line=False, lowercase_tokens=False, size=None ) self.vocab, self.pad_idx = self._create_vocab() super().__init__(is_input) @property def column_schema(self): return [(self.label_column, str)]
[docs] def initialize(self, from_scratch=True): """ Look through the dataset for all labels and create a vocab map for them. """ if self.vocab and from_scratch: return try: while True: row = yield labels = row[self.label_column] self.vocab_builder.add_all(labels) except GeneratorExit: if self.add_labels: self.vocab_builder.add_all(self.add_labels) self.vocab, self.pad_idx = self._create_vocab()
def _create_vocab(self): if not self.vocab_builder.has_added_tokens(): error_msg = ( "Label classes are not specified, and no examples or labels were found " "in training data. Either the training data is empty, or the data " "fields are misnamed and no examples are parsed (warnings would appear " "in preceding stdout logs)." ) raise ValueError(error_msg) vocab = self.vocab_builder.make_vocab() pad_idx = ( vocab.get_pad_index() if self.pad_in_vocab else Padding.DEFAULT_LABEL_PAD_IDX ) return vocab, pad_idx
[docs] def numberize(self, row): """Numberize labels.""" return self.vocab.lookup_all(row[self.label_column])
[docs] def tensorize(self, batch): return pad_and_tensorize(batch, self.pad_idx)
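# Sketch (not part of the original module): a LabelTensorizer built from a
# fixed label set; the initialize() pass over the data can be skipped because
# label_vocab is supplied up front, and the vocab order is exactly the list
# given (see the class docstring on `[negative_class, positive_class]`).
#
#   >>> lt = LabelTensorizer(label_column="label", label_vocab=["negative", "positive"])
#   >>> lt.numberize({"label": "positive"})
#   1
#   >>> lt.tensorize([0, 1, 1])            # -> tensor([0, 1, 1])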
class LabelListTensorizer(LabelTensorizer):
    """LabelListTensorizer takes a list of labels as input and generates a
    tuple of tensors (label_idx, list_length).
    """
[docs] class Config(LabelTensorizer.Config): # pad missing label in the list, including None and empty pad_missing: bool = False
[docs] @classmethod def from_config(cls, config: Config): return cls( config.column, config.allow_unknown, config.pad_in_vocab, config.label_vocab, config.label_vocab_file, config.is_input, pad_missing=config.pad_missing, )
def __init__(self, *args, pad_missing: bool = False, **kwargs): super().__init__(*args, **kwargs) self.pad_missing = pad_missing def __setstate__(self, newstate): # for backward compatibility if "pad_missing" not in newstate: newstate["pad_missing"] = True self.__dict__.update(newstate) @property def column_schema(self): return [(self.label_column, List[str])]
    def numberize(self, row):
        label_idx_list = []
        for label in row[self.label_column]:
            # Only None and the empty string are treated as missing data;
            # values like "False" are legitimate labels.
            if label in [None, ""]:
                if self.pad_missing:
                    label_idx_list.append(self.pad_idx)
                else:
                    raise Exception(
                        "Found none or empty value in the list, "
                        "while pad_missing is disabled"
                    )
            else:
                label_idx_list.append(self.vocab.lookup_all(label))
        return label_idx_list, len(label_idx_list)
[docs] def tensorize(self, batch): labels, labels_len = zip(*batch) return super().tensorize(labels), pad_and_tensorize(labels_len)
[docs] def sort_key(self, row): # use list length as sort key return row[1]
class LabelListRankTensorizer(LabelTensorizer):
    """LabelListRankTensorizer takes a single list of [label, rank] pairs,
    e.g. [[labelA, rankA], [labelB, rankB], ...], as input and generates a
    tuple of tensors (label_idx, list_length).

    Example: the input ['["weather","1"]', '["business","1"]'] with the vocab
    {"timer", "weather", "business"} yields an output of size len(vocab):
    [0, 1, 1]. This would suggest both labels are of equal rank.
    """
[docs] class Config(LabelTensorizer.Config): # pad missing label in the list, including None and empty pad_missing: bool = False
[docs] @classmethod def from_config(cls, config: Config): return cls( config.column, config.allow_unknown, config.pad_in_vocab, config.label_vocab, config.label_vocab_file, config.is_input, pad_missing=config.pad_missing, )
def __init__(self, *args, pad_missing: bool = False, **kwargs): super().__init__(*args, **kwargs) self.pad_missing = pad_missing def __setstate__(self, newstate): # for backward compatibility if "pad_missing" not in newstate: newstate["pad_missing"] = True self.__dict__.update(newstate) @property def column_schema(self): return [(self.label_column, List[str])]
    def numberize(self, row):
        label_idx_list = [0] * len(self.vocab)
        elem_struct_0 = list(map(json.loads, row[self.label_column]))
        for elemRow in elem_struct_0:
            label = elemRow[0]
            labelRank = int(elemRow[1])
            # Only None and the empty string are treated as missing data;
            # values like "False" are legitimate labels.
            if label in [None, ""]:
                if self.pad_missing:
                    raise Exception("Invalid state for LabelStructTensorizer")
                else:
                    raise Exception(
                        "Found none or empty value in the list, "
                        "while pad_missing is disabled"
                    )
            else:
                if labelRank == 1:
                    label_idx_list[self.vocab.lookup_all(label)] = 1
        return label_idx_list, len(label_idx_list)
[docs] def tensorize(self, batch): labels, labels_len = zip(*batch) return super().tensorize(labels), pad_and_tensorize(labels_len)
[docs] def sort_key(self, row): # use list length as sort key return row[1]
[docs] def initialize(self, from_scratch=True): """ Look through the dataset for all labels and create a vocab map for them. """ if self.vocab and from_scratch: return try: while True: row = yield elem_struct_0 = list(map(json.loads, row[self.label_column])) for elemRow in elem_struct_0: self.vocab_builder.add_all(elemRow[0]) except GeneratorExit: if self.add_labels: self.vocab_builder.add_all(self.add_labels) self.vocab, self.pad_idx = self._create_vocab()
[docs]class UidTensorizer(Tensorizer): """Numberize user IDs which can be either strings or tensors."""
[docs] class Config(Tensorizer.Config): column: str = "uid" # Allow unknown users during prediction. allow_unknown: bool = True
[docs] @classmethod def from_config(cls, config: Config): return cls(config.column, config.allow_unknown, config.is_input)
def __init__( self, uid_column: str = "uid", allow_unknown: bool = True, is_input: bool = Config.is_input, ): self.uid_column = uid_column self.vocab_builder = VocabBuilder() # User IDs should have the same lengths so need not to use padding. self.vocab_builder.use_pad = False self.vocab_builder.use_unk = allow_unknown self.vocab = None self.pad_idx = -1 super().__init__(is_input) @property def column_schema(self): return [(self.uid_column, str)] def _get_row_value_as_str(self, row) -> str: """Handle the case that the row value is not a string.""" row_value = row[self.uid_column] if isinstance(row_value, torch.Tensor): assert ( row_value.dim() == 0 or len(row_value) == 1 ), "Cannot get the value of multi-dimensional tensors." row_value = str(row_value.item()) return row_value
[docs] def initialize(self, from_scratch=True): """ Look through the dataset for all uids and create a vocab map for them. """ if self.vocab and from_scratch: return try: while True: row = yield uids = self._get_row_value_as_str(row) self.vocab_builder.add_all(uids) except GeneratorExit: self.vocab, self.pad_idx = self._create_vocab()
def _create_vocab(self): vocab = self.vocab_builder.make_vocab() pad_idx = Padding.DEFAULT_LABEL_PAD_IDX return vocab, pad_idx
[docs] def numberize(self, row): """Numberize uids.""" return self.vocab.lookup_all(self._get_row_value_as_str(row))
[docs] def tensorize(self, batch): return pad_and_tensorize(batch, self.pad_idx)
[docs]class SoftLabelTensorizer(LabelTensorizer): """ Handles numberizing labels for knowledge distillation. This still requires the same label column as `LabelTensorizer` for the "true" label, but also processes soft "probabilistic" labels generated from a teacher model, via three new columns. """
[docs] class Config(LabelTensorizer.Config): probs_column: str = "target_probs" logits_column: str = "target_logits" labels_column: str = "target_labels"
[docs] @classmethod def from_config(cls, config: Config): return cls( config.column, config.allow_unknown, config.pad_in_vocab, config.label_vocab, config.probs_column, config.logits_column, config.labels_column, config.label_vocab_file, config.is_input, )
def __init__( self, label_column: str = "label", allow_unknown: bool = False, pad_in_vocab: bool = False, label_vocab: Optional[List[str]] = None, probs_column: str = "target_probs", logits_column: str = "target_logits", labels_column: str = "target_labels", label_vocab_file: Optional[str] = None, is_input: bool = Config.is_input, ): super().__init__( label_column, allow_unknown, pad_in_vocab, label_vocab, label_vocab_file, is_input, ) self.probs_column = probs_column self.logits_column = logits_column self.labels_column = labels_column @property def column_schema(self): return [ (self.label_column, str), (self.probs_column, List[float]), (self.logits_column, List[float]), (self.labels_column, List[str]), ]
[docs] def numberize(self, row): """Numberize hard and soft labels""" label = self.vocab.lookup_all(row[self.label_column]) row_labels = row[self.labels_column] probs = align_target_label(row[self.probs_column], row_labels, self.vocab.idx) logits = align_target_label(row[self.logits_column], row_labels, self.vocab.idx) return label, probs, logits
[docs] def tensorize(self, batch): label, probs, logits = zip(*batch) # Set probs and logits shape because they should not change with fp16 probs_shape = len(probs), len(self.vocab) return ( pad_and_tensorize(label, self.pad_idx), pad_and_tensorize(probs, dtype=torch.float, pad_shape=probs_shape), pad_and_tensorize(logits, dtype=torch.float, pad_shape=probs_shape), )
[docs]class NumericLabelTensorizer(Tensorizer): """Numberize numeric labels."""
[docs] class Config(Tensorizer.Config): #: The name of the label column to parse from the data source. column: str = "label" #: If provided, the range of values the raw label can be. Will rescale the #: label values to be within [0, 1]. rescale_range: Optional[List[float]] = None # Indicate if it can be used to generate input Tensors for prediction is_input: bool = False
[docs] @classmethod def from_config(cls, config: Config): return cls(config.column, config.rescale_range, config.is_input)
def __init__( self, label_column: str = Config.column, rescale_range: Optional[List[float]] = Config.rescale_range, is_input: bool = Config.is_input, ): self.label_column = label_column if rescale_range is not None: assert len(rescale_range) == 2 assert rescale_range[0] < rescale_range[1] self.rescale_range = rescale_range super().__init__(is_input) @property def column_schema(self): return [(self.label_column, str)]
[docs] def numberize(self, row): """Numberize labels.""" label = float(row[self.label_column]) if self.rescale_range is not None: label -= self.rescale_range[0] label /= self.rescale_range[1] - self.rescale_range[0] assert 0 <= label <= 1 return label
[docs] def tensorize(self, batch): return pad_and_tensorize(batch, dtype=torch.float)
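# Sketch (not part of the original module): rescale_range maps raw label
# values linearly into [0, 1].
#
#   >>> nt = NumericLabelTensorizer(label_column="label", rescale_range=[1.0, 5.0])
#   >>> nt.numberize({"label": "4"})       # (4 - 1) / (5 - 1)
#   0.75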
[docs]class FloatListTensorizer(Tensorizer): """Numberize numeric labels."""
[docs] class Config(Tensorizer.Config): #: The name of the label column to parse from the data source. column: str error_check: bool = False dim: Optional[int] = None # If you wish to normalize the training data here, you probably also # want to normalize the inference data. This is currently supported with # TorchScript models (see DocModel). See T48207828 for progress on # supporting Caffe2 models. normalize: bool = False
[docs] @classmethod def from_config(cls, config: Config): return cls( config.column, config.error_check, config.dim, config.normalize, config.is_input, )
def __init__( self, column: str, error_check: bool, dim: Optional[int], normalize: bool, is_input: bool = Config.is_input, ): self.column = column self.error_check = error_check self.dim = dim assert not normalize or self.dim is not None, "Normalization requires dim" assert not self.error_check or self.dim is not None, "Error check requires dim" # If normalize and error_check both are false and dim is still None, set # it to 0 so that it can successfully create VectorNormalizer if dim is None: dim = 0 self.normalizer = VectorNormalizer(dim, normalize) super().__init__(is_input) @property def column_schema(self): return [(self.column, List[float])]
[docs] def initialize(self): if not self.normalizer.do_normalization: self.normalizer.calculate_feature_stats() return try: while True: row = yield res = row[self.column] self.normalizer.update_meta_data(res) except GeneratorExit: self.normalizer.calculate_feature_stats()
[docs] def numberize(self, row): dense = row[self.column] if self.error_check: assert ( len(dense) == self.dim ), f"Dense feature didn't match expected dimension {self.dim}: {dense}" return self.normalizer.normalize([dense])[0]
[docs] def tensorize(self, batch): # training in fp16 will pad tensor shape to multiple of 8 unless # explicitly specify pad_shape to avoid padding. pad_shape = (len(batch), self.dim) if self.dim else None return maybe_half( pad_and_tensorize(batch, dtype=torch.float, pad_shape=pad_shape) )
NO_LABEL = constants.Token("NoLabel")
[docs]class SlotLabelTensorizer(Tensorizer): """Numberize word/slot labels."""
[docs] class Config(Tensorizer.Config): #: The name of the slot label column to parse from the data source. slot_column: str = "slots" #: The name of the text column to parse from the data source. #: We need this to be able to generate tensors which correspond to input text. text_column: str = "text" #: The tokenizer to use to split input text into tokens. This should be #: configured in a way which yields tokens consistent with the tokens input to #: or output by a model, so that the labels generated by this tensorizer #: will match the indices of the model's tokens. tokenizer: Tokenizer.Config = Tokenizer.Config() #: Whether to allow for unknown labels at test/prediction time allow_unknown: bool = False # Indicate if it can be used to generate input Tensors for prediction is_input: bool = False
[docs] @classmethod def from_config(cls, config: Config): tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer) return cls( config.slot_column, config.text_column, tokenizer, config.allow_unknown, config.is_input, )
def __init__( self, slot_column: str = Config.slot_column, text_column: str = Config.text_column, tokenizer: Tokenizer = None, allow_unknown: bool = Config.allow_unknown, is_input: bool = Config.is_input, ): self.slot_column = slot_column self.text_column = text_column self.allow_unknown = allow_unknown self.tokenizer = tokenizer or Tokenizer() self.pad_idx = Padding.DEFAULT_LABEL_PAD_IDX self.vocab_builder = VocabBuilder() self.vocab_builder.add(NO_LABEL) self.vocab_builder.use_pad = False self.vocab_builder.use_unk = self.allow_unknown self.vocab = None super().__init__(is_input) @property def column_schema(self): return [(self.text_column, str), (self.slot_column, List[Slot])]
[docs] def initialize(self, from_scratch=True): """Look through the dataset for all labels and create a vocab map for them.""" if self.vocab and from_scratch: return try: while True: row = yield slots = row[self.slot_column] self.vocab_builder.add_all(s.label for s in slots) except GeneratorExit: self.vocab = self.vocab_builder.make_vocab()
[docs] def numberize(self, row): """ Turn slot labels and text into a list of token labels with the same length as the number of tokens in the text. """ slots = row[self.slot_column] text = row[self.text_column] tokens = self.tokenizer.tokenize(text) indexed_tokens = tokens labels = [] current_slot = 0 current_token = 0 while current_token < len(tokens) and current_slot < len(slots): _, start, end = indexed_tokens[current_token] slot = slots[current_slot] if start > slot.end: current_slot += 1 else: current_token += 1 labels.append(slot.label if end > slot.start else NO_LABEL) labels += [NO_LABEL] * (len(tokens) - current_token) return self.vocab.lookup_all(labels)
[docs] def tensorize(self, batch): return pad_and_tensorize(batch, dtype=torch.long)
[docs]class SlotLabelTensorizerExpansible(SlotLabelTensorizer): """Create a base SlotLabelTensorizer to support selecting different types in ModelInput.""" __EXPANSIBLE__ = True
[docs]class GazetteerTensorizer(Tensorizer): """ Create 3 tensors for dict features. - idx: index of feature in token order. - weights: weight of feature in token order. - lens: number of features per token. For each input token, there will be the same number of `idx` and `weights` entries. (equal to the max number of features any token has in this row). The values in `lens` will tell how many of these features are actually used per token. Input format for the dict column is json and should be a list of dictionaries containing the "features" and their weight for each relevant "tokenIdx". Example: :: text: "Order coffee from Starbucks please" dict: [ {"tokenIdx": 1, "features": {"drink/beverage": 0.8, "music/song": 0.2}}, {"tokenIdx": 3, "features": {"store/coffee_shop": 1.0}} ] if we assume this vocab :: vocab = { UNK: 0, PAD: 1, "drink/beverage": 2, "music/song": 3, "store/coffee_shop": 4 } this example will result in those tensors: :: idx = [1, 1, 2, 3, 1, 1, 4, 1, 1, 1] weights = [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0] lens = [1, 2, 1, 1, 1] """
[docs] class Config(Tensorizer.Config): text_column: str = "text" dict_column: str = "dict" #: tokenizer to split text and create dict tensors of the same size. tokenizer: Tokenizer.Config = Tokenizer.Config()
[docs] @classmethod def from_config(cls, config: Config): tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer) return cls(config.text_column, config.dict_column, tokenizer, config.is_input)
def __init__( self, text_column: str = Config.text_column, dict_column: str = Config.dict_column, tokenizer: Tokenizer = None, is_input: bool = Config.is_input, ): self.text_column = text_column self.dict_column = dict_column self.tokenizer = tokenizer or Tokenizer() self.vocab_builder = VocabBuilder() self.vocab = None super().__init__(is_input) @property def column_schema(self): return [(self.text_column, str), (self.dict_column, Gazetteer)]
[docs] def initialize(self, from_scratch=True): """ Look through the dataset for all dict features to create vocab. """ if self.vocab and from_scratch: return try: while True: row = yield for token_dict in row[self.dict_column]: self.vocab_builder.add_all(token_dict["features"]) except GeneratorExit: self.vocab = self.vocab_builder.make_vocab()
[docs] def numberize(self, row): """ Numberize dict features. Fill in for tokens with no features with PAD and weight 0.0. All tokens need to have at least one entry. Tokens with more than one feature will have multiple idx and weight added in sequence. """ num_tokens = len(self.tokenizer.tokenize(row[self.text_column])) num_labels = max(len(t["features"]) for t in row[self.dict_column]) res_idx = [self.vocab.get_pad_index()] * (num_labels * num_tokens) res_weights = [0.0] * (num_labels * num_tokens) res_lens = [1] * num_tokens for dict_feature in row[self.dict_column]: idx = dict_feature["tokenIdx"] feats = dict_feature["features"] pos = idx * num_labels res_lens[idx] = len(feats) # write values at the correct pos for label, weight in feats.items(): res_idx[pos] = self.vocab.lookup_all(label) res_weights[pos] = weight pos += 1 return res_idx, res_weights, res_lens
[docs] def tensorize(self, batch): # Pad a minibatch of dictionary features to be # batch_size * max_number_of_words * max_number_of_features # unpack the minibatch feats, weights, lengths = zip(*batch) lengths_flattened = [li for l_list in lengths for li in l_list] seq_lens = [len(l_list) for l_list in lengths] max_ex_len = precision.pad_length(max(seq_lens)) max_feat_len = max(lengths_flattened) all_lengths, all_feats, all_weights = [], [], [] for i, seq_len in enumerate(seq_lens): ex_feats, ex_weights, ex_lengths = [], [], [] feats_lengths, feats_vals, feats_weights = lengths[i], feats[i], weights[i] max_feat_len_example = max(feats_lengths) r_offset = 0 for _ in feats_lengths: # The dict feats obtained from the featurizer will have necessary # padding at the utterance level. Therefore we move the offset by # max feature length in the example. ex_feats.extend(feats_vals[r_offset : r_offset + max_feat_len_example]) ex_feats.extend( [self.vocab.get_pad_index()] * (max_feat_len - max_feat_len_example) ) ex_weights.extend( feats_weights[r_offset : r_offset + max_feat_len_example] ) ex_weights.extend([0.0] * (max_feat_len - max_feat_len_example)) r_offset += max_feat_len_example ex_lengths.extend(feats_lengths) # Pad examples ex_padding = (max_ex_len - seq_len) * max_feat_len ex_feats.extend([self.vocab.get_pad_index()] * ex_padding) ex_weights.extend([0.0] * ex_padding) ex_lengths.extend([1] * (max_ex_len - seq_len)) all_feats.append(ex_feats) all_weights.append(ex_weights) all_lengths.append(ex_lengths) return ( cuda.tensor(all_feats, torch.long), precision.maybe_half(cuda.tensor(all_weights, torch.float)), cuda.tensor(all_lengths, torch.long), )
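# Sketch (not part of the original module): numberize() on the example from
# the class docstring. The vocab layout {UNK: 0, PAD: 1, "drink/beverage": 2,
# "music/song": 3, "store/coffee_shop": 4} is the one assumed there, and `g`
# stands for an already-initialized GazetteerTensorizer.
#
#   >>> row = {
#   ...     "text": "Order coffee from Starbucks please",
#   ...     "dict": [
#   ...         {"tokenIdx": 1, "features": {"drink/beverage": 0.8, "music/song": 0.2}},
#   ...         {"tokenIdx": 3, "features": {"store/coffee_shop": 1.0}},
#   ...     ],
#   ... }
#   >>> idx, weights, lens = g.numberize(row)
#   >>> idx
#   [1, 1, 2, 3, 1, 1, 4, 1, 1, 1]
#   >>> weights
#   [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
#   >>> lens
#   [1, 2, 1, 1, 1]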
class SeqTokenTensorizer(Tensorizer):
    """
    Tensorize a sequence of sentences. The input is a list of strings,
    like this one::

        ["where do you wanna meet?", "MPK"]

    if we assume this vocab::

        vocab {
            UNK: 0, PAD: 1,
            'where': 2, 'do': 3, 'you': 4, 'wanna': 5, 'meet?': 6, 'mpk': 7
        }

    this example will result in those tensors::

        idx = [[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]]
        sentence_len = [5, 1]
        seq_len = [2]

    If you're using BOS, EOS, BOL and EOL, the vocab will look like this::

        vocab {
            UNK: 0, PAD: 1, BOS: 2, EOS: 3, BOL: 4, EOL: 5,
            'where': 6, 'do': 7, 'you': 8, 'wanna': 9, 'meet?': 10, 'mpk': 11
        }

    and this example will result in those tensors::

        idx = [
            [2, 4, 3, 1, 1, 1, 1],
            [2, 6, 7, 8, 9, 10, 3],
            [2, 11, 3, 1, 1, 1, 1],
            [2, 5, 3, 1, 1, 1, 1]
        ]
        sentence_len = [3, 7, 3, 3]
        seq_len = [4]
    """
[docs]    class Config(Tensorizer.Config):
        column: str = "text_seq"
        # This is actually the max token count; it is named this way because
        # the variable is used in the _tokenize function from TokenTensorizer.
        max_seq_len: Optional[int] = None
        #: sentence markers
        add_bos_token: bool = False
        add_eos_token: bool = False
        use_eos_token_for_bos: bool = False
        #: list markers
        add_bol_token: bool = False
        add_eol_token: bool = False
        use_eol_token_for_bol: bool = False
        #: The tokenizer to use to split input text into tokens.
        tokenizer: Tokenizer.Config = Tokenizer.Config()
        # the max number of turns in one example
        max_turn: int = 50

[docs]    @classmethod
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        return cls(
            column=config.column,
            tokenizer=tokenizer,
            add_bos_token=config.add_bos_token,
            add_eos_token=config.add_eos_token,
            use_eos_token_for_bos=config.use_eos_token_for_bos,
            add_bol_token=config.add_bol_token,
            add_eol_token=config.add_eol_token,
            use_eol_token_for_bol=config.use_eol_token_for_bol,
            max_seq_len=config.max_seq_len,
            is_input=config.is_input,
            max_turn=config.max_turn,
        )

    def __init__(
        self,
        column: str = Config.column,
        tokenizer=None,
        add_bos_token: bool = Config.add_bos_token,
        add_eos_token: bool = Config.add_eos_token,
        use_eos_token_for_bos: bool = Config.use_eos_token_for_bos,
        add_bol_token: bool = Config.add_bol_token,
        add_eol_token: bool = Config.add_eol_token,
        use_eol_token_for_bol: bool = Config.use_eol_token_for_bol,
        max_seq_len=Config.max_seq_len,
        vocab=None,
        is_input: bool = Config.is_input,
        max_turn=50,
    ):
        self.column = column
        self.tokenizer = tokenizer or Tokenizer()
        self.vocab = vocab
        self.vocab_builder = None
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.add_bol_token = add_bol_token
        self.add_eol_token = add_eol_token
        self.use_eol_token_for_bol = use_eol_token_for_bol
        # This is actually the max token count; it is named this way because
        # the variable is used in the _tokenize function from TokenTensorizer.
        self.max_seq_len = max_seq_len or 2 ** 30  # large number
        self.max_turn = max_turn
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, List[str])]

[docs]    def initialize(self, vocab_builder=None, from_scratch=True):
        """Build vocabulary based on training corpus."""
        if self.vocab and from_scratch:
            return
        if not self.vocab_builder:
            self.vocab_builder = vocab_builder or VocabBuilder()
            self.vocab_builder.use_bos = self.add_bos_token
            self.vocab_builder.use_eos = self.add_eos_token
            self.vocab_builder.use_bol = self.add_bol_token
            self.vocab_builder.use_eol = self.add_eol_token
        try:
            while True:
                row = yield
                for raw_text in row[self.column]:
                    tokenized = self.tokenizer.tokenize(raw_text)
                    self.vocab_builder.add_all([t.value for t in tokenized])
        except GeneratorExit:
            self.vocab = self.vocab_builder.make_vocab()

    _lookup_tokens = TokenTensorizer._lookup_tokens
    _tokenize = TokenTensorizer._tokenize

[docs]    def numberize(self, row):
        """Tokenize, look up in vocabulary."""
        return self._process(row, raw_token_output=False)

[docs]    def prepare_input(self, row):
        """Tokenize, return tokenized_texts in raw text"""
        seq, sen_lens, seq_lens = self._process(row, raw_token_output=True)
        # convert all special tokens to str
        return [[str(token) for token in sen] for sen in seq], sen_lens, seq_lens

    def _process(self, row, raw_token_output):
        sentence_process_fn = (
            self._tokenize if raw_token_output else self._lookup_tokens
        )
        pad_token = (
            self.vocab.pad_token if raw_token_output else self.vocab.get_pad_index()
        )
        seq = []

        if self.add_bol_token:
            bol = EOL if self.use_eol_token_for_bol else BOL
            tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(bol, -1, -1)])
            seq.append(list(tokens))

        for raw_text in row[self.column][: self.max_turn]:
            tokens, _, _ = sentence_process_fn(raw_text)
            seq.append(list(tokens))

        if self.add_eol_token:
            tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(EOL, -1, -1)])
            seq.append(list(tokens))

        max_len = max(len(sentence) for sentence in seq)
        sentence_lens = []
        for sentence in seq:
            sen_len = len(sentence)
            sentence_lens.append(sen_len)
            pad_len = max_len - sen_len
            if pad_len:
                sentence += [pad_token] * pad_len
        return seq, sentence_lens, len(seq)

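Continuing the class docstring example (assuming the BOS/EOS/BOL/EOL vocabulary shown there and all sentence/list markers enabled), numberize would return the already-padded sentences together with their lengths; the values below are spelled out only as an illustration:
::

    row = {"text_seq": ["where do you wanna meet?", "MPK"]}
    # numberize(row) -> (seq, sentence_lens, seq_len)
    expected_seq = [
        [2, 4, 3, 1, 1, 1, 1],    # BOS BOL EOS + padding
        [2, 6, 7, 8, 9, 10, 3],   # BOS where do you wanna meet? EOS
        [2, 11, 3, 1, 1, 1, 1],   # BOS mpk EOS + padding
        [2, 5, 3, 1, 1, 1, 1],    # BOS EOL EOS + padding
    ]
    expected_sentence_lens = [3, 7, 3, 3]
    expected_seq_len = 4
    assert [len(s) for s in expected_seq] == [7, 7, 7, 7]
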
[docs]    def tensorize(self, batch):
        tokens, sentence_lens, seq_lens = zip(*batch)
        return (
            pad_and_tensorize(tokens, self.vocab.get_pad_index()),
            # pad with a length of 1, because 0 would cause issues in the LSTM
            pad_and_tensorize(sentence_lens, 1),
            pad_and_tensorize(seq_lens, 0),
        )

[docs]    def sort_key(self, row):
        # sort by seq_len first, then max sentence len
        return row[2] + row[1] / self.max_turn

[docs]class AnnotationNumberizer(Tensorizer):
    """
    Not really a Tensorizer (since it does not create tensors) but technically
    serves the same function. This class parses Annotations in the format below
    and extracts the actions (type List[List[int]])
    ::

        [IN:GET_ESTIMATED_DURATION How long will it take to [SL:METHOD_TRAVEL
        drive ] from [SL:SOURCE Chicago ] to [SL:DESTINATION Mississippi ] ]

    Extraction algorithm is handled by Annotation class. We only care about
    the list of actions, which before vocab index lookups would look like:
    ::

        [
            IN:GET_ESTIMATED_DURATION, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
            SL:METHOD_TRAVEL, SHIFT, REDUCE,
            SHIFT, SL:SOURCE, SHIFT, REDUCE,
            SHIFT, SL:DESTINATION, SHIFT, REDUCE,
        ]
    """

[docs]    class Config(Tensorizer.Config):
        column: str = "seqlogical"

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(column=config.column, is_input=config.is_input)

    def __init__(
        self, column: str = Config.column, vocab=None, is_input: bool = Config.is_input
    ):
        self.column = column
        self.vocab = vocab
        self.vocab_builder = None
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, str)]

[docs]    def initialize(self, vocab_builder=None, from_scratch=True):
        """Build vocabulary based on training corpus."""
        if self.vocab and from_scratch:
            return
        if not self.vocab_builder:
            self.vocab_builder = vocab_builder or VocabBuilder()
            self.vocab_builder.use_unk = False
            self.vocab_builder.use_pad = False
        try:
            while True:
                row = yield
                annotation = Annotation(row[self.column])
                actions = annotation.tree.to_actions()
                self.vocab_builder.add_all(actions)
        except GeneratorExit:
            self.vocab = self.vocab_builder.make_vocab()
            self.shift_idx = self.vocab.idx[SHIFT]
            self.reduce_idx = self.vocab.idx[REDUCE]

            def filterVocab(fn):
                return [token for nt, token in self.vocab.idx.items() if fn(nt)]

            self.ignore_subNTs_roots = filterVocab(is_unsupported)
            self.valid_NT_idxs = filterVocab(is_valid_nonterminal)
            self.valid_IN_idxs = filterVocab(is_intent_nonterminal)
            self.valid_SL_idxs = filterVocab(is_slot_nonterminal)

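The filterVocab helper simply collects the vocabulary indices whose nonterminal string satisfies a predicate. A toy equivalent, assuming (as in the class docstring) that intent nonterminals are the entries prefixed with "IN:":
::

    toy_idx = {"IN:GET_ESTIMATED_DURATION": 0, "SHIFT": 1, "SL:SOURCE": 2, "REDUCE": 3}
    valid_IN_idxs = [token for nt, token in toy_idx.items() if nt.startswith("IN:")]
    assert valid_IN_idxs == [0]
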
[docs]    def numberize(self, row):
        """Parse the annotation into actions and look them up in the vocabulary."""
        annotation = Annotation(row[self.column])
        return self.vocab.lookup_all(annotation.tree.to_actions())

[docs]    def tensorize(self, batch):
        return batch

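End to end, numberize yields a flat list of action indices and tensorize passes the batch through unchanged. A hedged sketch with a made-up action vocabulary (real indices depend on the order in which actions are seen during training):
::

    actions = [
        "IN:GET_ESTIMATED_DURATION", "SHIFT", "SHIFT",
        "SL:METHOD_TRAVEL", "SHIFT", "REDUCE",
    ]
    action_vocab = {a: i for i, a in enumerate(dict.fromkeys(actions))}
    numberized = [action_vocab[a] for a in actions]
    assert numberized == [0, 1, 1, 2, 1, 3]
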
[docs]class MetricTensorizer(Tensorizer):
    """A tensorizer that uses other tensorizers' numberized data.
    Used mostly for metric reporting."""

[docs]    class Config(Tensorizer.Config):
        names: List[str]
        indexes: List[int]
        # Indicate if it can be used to generate input Tensors for prediction
        is_input: bool = False

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.names, config.indexes, config.is_input)

    def __init__(
        self, names: List[str], indexes: List[int], is_input: bool = Config.is_input
    ):
        self.names = names
        self.indexes = indexes
        super().__init__(is_input)

[docs]    def numberize(self, row):
        # The metric tensorizer depends on other tensorizers' numberized results.
        return None

[docs]    def tensorize(self, batch):
        raise NotImplementedError

[docs]class NtokensTensorizer(MetricTensorizer):
    """A tensorizer that references another tensorizer's numberized data
    to calculate the number of tokens. Used for calculating tokens per second."""

[docs]    def tensorize(self, batch):
        ntokens = 0
        for name, index in zip(self.names, self.indexes):
            ntokens += sum((sample[index] for sample in batch[name]))
        return ntokens

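A minimal sketch of the counting above, assuming names=["tokens"], indexes=[1] and a referenced tensorizer that stores the token count at position 1 of each numberized row (all of which are illustrative choices, not PyText defaults):
::

    batch = {"tokens": [([4, 5, 6], 3), ([7, 8], 2)]}
    ntokens = sum(sample[1] for sample in batch["tokens"])
    assert ntokens == 5
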
[docs]class FloatTensorizer(Tensorizer):
    """A tensorizer for reading in scalars from the data."""

[docs]    class Config(Tensorizer.Config):
        #: The name of the column to parse from the data source.
        column: str

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(config.column, config.is_input)

    def __init__(self, column: str, is_input: bool = Config.is_input):
        self.column = column
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, float)]

[docs]    def numberize(self, row):
        return row[self.column]

[docs]    def tensorize(self, batch):
        return cuda.tensor(batch, torch.float)

[docs]class FloatListSeqTensorizer(Tensorizer):
    """Numberize numeric labels."""

    __TENSORIZER_SCRIPT_IMPL__ = ScriptFloatListSeqTensorizer

[docs]    class Config(Tensorizer.Config):
        #: The name of the label column to parse from the data source.
        column: str
        error_check: bool = False
        dim: Optional[int] = None
        pad_token: float = -1.0

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.column,
            config.error_check,
            config.dim,
            config.pad_token,
            config.is_input,
        )

    def __init__(
        self,
        column: str,
        error_check: bool,
        dim: Optional[int],
        pad_token: float = Config.pad_token,
        is_input: bool = Config.is_input,
    ):
        self.column = column
        self.error_check = error_check
        self.dim = dim
        self.pad_token = pad_token
        assert not self.error_check or self.dim is not None, "Error check requires dim"
        super().__init__(is_input)

    @property
    def column_schema(self):
        return [(self.column, List[List[float]])]

[docs]    def numberize(self, row):
        floatSeq_features = row[self.column]
        if self.error_check:
            for dense in floatSeq_features:
                assert (
                    len(dense) == self.dim
                ), f"Dense feature didn't match expected dimension {self.dim}: {dense}"
        return floatSeq_features, len(floatSeq_features)

[docs]    def tensorize(self, batch):
        float_lists, lens = zip(*batch)
        padded_and_tensorized_float_lists = pad_and_tensorize(
            float_lists, pad_token=self.pad_token, dtype=torch.float
        )
        return (padded_and_tensorized_float_lists, pad_and_tensorize(lens))

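For a batch of two dense-feature sequences of lengths 2 and 1, the padded output would look roughly like the plain-torch sketch below (using the default pad_token of -1.0; the feature values are made up):
::

    import torch

    padded = torch.tensor(
        [
            [[0.1, 0.2], [0.3, 0.4]],
            [[0.5, 0.6], [-1.0, -1.0]],  # second sequence padded to length 2
        ],
        dtype=torch.float,
    )
    lens = torch.tensor([2, 1])
    assert padded.shape == (2, 2, 2)  # batch x max_seq_len x dim
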
    @lazy_property
    def tensorizer_script_impl(self):
        return ScriptFloatListSeqTensorizer(self.pad_token)

[docs]class String2DListTensorizerScriptImpl(TensorizerScriptImpl):
    def __init__(
        self,
        vocab: Vocabulary,
    ):
        super().__init__()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
        )

[docs]    def numberize(
        self, tokens: List[List[str]]
    ) -> Tuple[List[List[int]], List[int], int]:
        token_indices: List[List[int]] = self.vocab.lookup_indices_2d(tokens)
        token_lengths: List[int] = []
        for idx in range(len(token_indices)):
            token_lengths.append(len(token_indices[idx]))
        return token_indices, token_lengths, len(token_indices)

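The numberize contract (per-row token indices, per-row lengths, number of rows) is easy to mimic without ScriptVocabulary; the toy vocabulary below is an assumption made up for the sketch:
::

    toy_vocab = {"<pad>": 0, "hello": 1, "world": 2}
    tokens = [["hello", "world"], ["world"]]
    token_indices = [[toy_vocab[t] for t in row] for row in tokens]
    token_lengths = [len(row) for row in token_indices]
    assert token_indices == [[1, 2], [2]]
    assert (token_lengths, len(token_indices)) == ([2, 1], 2)
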
[docs]    def tensorize(
        self,
        tokens_3d: List[List[List[int]]],
        seq_lens_2d: List[List[int]],
        seq_lens_1d: List[int],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        padded_batch, _ = pad_3d(
            batch=tokens_3d, tokens_lengths=seq_lens_2d, pad_idx=self.vocab.pad_idx
        )
        return (
            torch.tensor(padded_batch, dtype=torch.long),
            torch.tensor(seq_lens_1d, dtype=torch.long),
        )

[docs]    def forward(
        self, inputs: List[List[List[str]]]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        tokens_3d: List[List[List[int]]] = []
        seq_lens_2d: List[List[int]] = []
        seq_lens_1d: List[int] = []
        for idx in range(len(inputs)):
            numberized: Tuple[List[List[int]], List[int], int] = self.numberize(
                inputs[idx]
            )
            tokens_3d.append(numberized[0])
            seq_lens_2d.append(numberized[1])
            seq_lens_1d.append(numberized[2])
        return self.tensorize(tokens_3d, seq_lens_2d, seq_lens_1d)

[docs]class String2DListTensorizer(Tensorizer):

    __TENSORIZER_SCRIPT_IMPL__ = String2DListTensorizerScriptImpl

[docs]    class Config(Tensorizer.Config):
        #: The name of the text column to parse from the data source.
        column: str = "text"
        vocab: VocabConfig = VocabConfig()
        vocab_file_delimiter: str = " "

[docs]    @classmethod
    def from_config(cls, config: Config):
        return cls(
            column=config.column,
            vocab_config=config.vocab,
            vocab_file_delimiter=config.vocab_file_delimiter,
            is_input=config.is_input,
        )

    def __init__(
        self,
        column,
        vocab_config=None,
        vocab=None,
        vocab_file_delimiter=" ",
        is_input=Config.is_input,
    ):
        self.column = column
        self.vocab = vocab
        self.vocab_builder = None
        self.vocab_config = vocab_config or VocabConfig()
        self.vocab_file_delimiter = vocab_file_delimiter
        super().__init__(is_input)

    @lazy_property
    def tensorizer_script_impl(self):
        return self.__TENSORIZER_SCRIPT_IMPL__(vocab=self.vocab)

    @property
    def column_schema(self):
        return [(self.column, str)]

[docs]    def initialize(self, from_scratch=True):
        self.vocab_builder = VocabBuilder(delimiter=self.vocab_file_delimiter)

        if self.vocab_config.build_from_data:
            try:
                while True:
                    row = yield
                    self.vocab_builder.add_all(chain.from_iterable(row[self.column]))
            except GeneratorExit:
                pass
            self.vocab_builder.truncate_to_vocab_size(
                self.vocab_config.size_from_data, self.vocab_config.min_counts
            )
        elif self.vocab_config.vocab_files is not None:
            try:
                # PyText will call this initializer with all the rows, but we
                # don't actually need that
                while True:
                    row = yield
            except GeneratorExit:
                pass
            # Okay, we finally got to do our thing
            for vocab_file in self.vocab_config.vocab_files:
                with PathManager.open(vocab_file.filepath) as f:
                    self.vocab_builder.add_from_file(
                        f,
                        vocab_file.skip_header_line,
                        vocab_file.lowercase_tokens,
                        vocab_file.size_limit,
                    )
        else:
            raise ValueError(
                f"To create token tensorizer for '{self.column}', either "
                f"`build_from_data` or `vocab_files` must be set."
            )

        self.vocab = self.vocab_builder.make_vocab()

[docs]    def numberize(self, row):
        return self.tensorizer_script_impl.numberize(row[self.column])

[docs]    def tensorize(self, batch):
        (
            token_indices_tensor,
            seq_lens_1d,
        ) = self.tensorizer_script_impl.tensorize_wrapper(*zip(*batch))
        return (
            cuda.tensor(token_indices_tensor, dtype=torch.long),
            cuda.tensor(seq_lens_1d, dtype=torch.long),
        )

[docs]def initialize_tensorizers(tensorizers, data_source, from_scratch=True):
    """A utility function to stream a data source to the initialize functions
    of a dict of tensorizers."""
    initializers = []
    for init in [
        tensorizer.initialize(from_scratch=from_scratch)
        if hasattr(tensorizer, "vocab")
        else tensorizer.initialize()
        for tensorizer in tensorizers.values()
    ]:
        try:
            init.send(None)  # kick
            initializers.append(init)
        except StopIteration:
            pass

    if initializers:
        for row in data_source:
            for init in initializers:
                init.send(row)

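A standalone sketch of the coroutine protocol this function drives: each initializer is primed with send(None), fed one row at a time, and finalizes its state on GeneratorExit (triggered below by close()). The toy initializer and rows are made up for the example:
::

    def toy_initializer(seen_tokens):
        try:
            while True:
                row = yield
                seen_tokens.update(row["text"].split())
        except GeneratorExit:
            pass

    seen = set()
    init = toy_initializer(seen)
    init.send(None)  # kick
    for row in [{"text": "where do you wanna meet?"}, {"text": "MPK"}]:
        init.send(row)
    init.close()
    assert "MPK" in seen and len(seen) == 6
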