Source code for pytext.models.doc_model

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import Dict, List, Optional, Union

import torch
from pytext.config.component import create_loss
from pytext.data.tensorizers import (
    ByteTokenTensorizer,
    FloatListTensorizer,
    LabelTensorizer,
    NumericLabelTensorizer,
    Tensorizer,
    TokenTensorizer,
    UidTensorizer,
)
from pytext.data.utils import PAD, UNK
from pytext.exporters.exporter import ModelExporter
from pytext.loss import BinaryCrossEntropyLoss, MultiLabelSoftMarginLoss
from pytext.models.decoders.mlp_decoder import DecoderBase, MLPDecoder
from pytext.models.embeddings import (
    CharacterEmbedding,
    EmbeddingBase,
    EmbeddingList,
    WordEmbedding,
)
from pytext.models.model import Model
from pytext.models.module import create_module
from pytext.models.output_layers import (
    ClassificationOutputLayer,
    OutputLayerBase,
    RegressionOutputLayer,
)
from pytext.models.output_layers.doc_classification_output_layer import (
    BinaryClassificationOutputLayer,
    MulticlassOutputLayer,
    MultiLabelOutputLayer,
)
from pytext.models.representations.bilstm_doc_attention import BiLSTMDocAttention
from pytext.models.representations.deepcnn import DeepCNNRepresentation
from pytext.models.representations.docnn import DocNNRepresentation
from pytext.models.representations.pure_doc_attention import PureDocAttention
from pytext.models.representations.representation_base import RepresentationBase
from pytext.torchscript.utils import make_byte_inputs, make_sequence_lengths, pad_2d
from pytext.torchscript.vocab import ScriptVocabulary
from pytext.utils.label import get_label_weights
from torch import jit


[docs]class DocModel(Model): """DocModel that's compatible with the new Model abstraction, which is responsible for describing which inputs it expects and arranging its input tensors.""" __EXPANSIBLE__ = True
[docs] class Config(Model.Config): class ModelInput(Model.Config.ModelInput): tokens: TokenTensorizer.Config = TokenTensorizer.Config() dense: Optional[FloatListTensorizer.Config] = None labels: LabelTensorizer.Config = LabelTensorizer.Config() inputs: ModelInput = ModelInput() embedding: WordEmbedding.Config = WordEmbedding.Config() representation: Union[ PureDocAttention.Config, BiLSTMDocAttention.Config, DocNNRepresentation.Config, DeepCNNRepresentation.Config, ] = BiLSTMDocAttention.Config() decoder: MLPDecoder.Config = MLPDecoder.Config() output_layer: ClassificationOutputLayer.Config = ( ClassificationOutputLayer.Config() )
[docs] def arrange_model_inputs(self, tensor_dict): tokens, seq_lens, _ = tensor_dict["tokens"] model_inputs = (tokens, seq_lens) if "dense" in tensor_dict: model_inputs += (tensor_dict["dense"],) return model_inputs
[docs] def arrange_targets(self, tensor_dict): return tensor_dict["labels"]
[docs] def get_export_input_names(self, tensorizers): res = ["tokens_vals", "tokens_lens"] if "dense" in tensorizers: res += ["float_vec_vals"] return res
[docs] def get_export_output_names(self, tensorizers): return ["scores"]
[docs] def vocab_to_export(self, tensorizers): return {"tokens_vals": list(tensorizers["tokens"].vocab)}
[docs] def caffe2_export(self, tensorizers, tensor_dict, path, export_onnx_path=None): exporter = ModelExporter( ModelExporter.Config(), self.get_export_input_names(tensorizers), self.arrange_model_inputs(tensor_dict), self.vocab_to_export(tensorizers), self.get_export_output_names(tensorizers), ) return exporter.export_to_caffe2(self, path, export_onnx_path=export_onnx_path)
[docs] def torchscriptify(self, tensorizers, traced_model): output_layer = self.output_layer.torchscript_predictions() input_vocab = tensorizers["tokens"].vocab class Model(jit.ScriptModule): def __init__(self): super().__init__() self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK]) self.model = traced_model self.output_layer = output_layer self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int) @jit.script_method def forward(self, tokens: List[List[str]]): seq_lens = make_sequence_lengths(tokens) word_ids = self.vocab.lookup_indices_2d(tokens) word_ids = pad_2d(word_ids, seq_lens, self.pad_idx) logits = self.model(torch.tensor(word_ids), torch.tensor(seq_lens)) return self.output_layer(logits) class ModelWithDenseFeat(jit.ScriptModule): def __init__(self): super().__init__() self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK]) self.normalizer = tensorizers["dense"].normalizer self.model = traced_model self.output_layer = output_layer self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int) @jit.script_method def forward(self, tokens: List[List[str]], dense_feat: List[List[float]]): seq_lens = make_sequence_lengths(tokens) word_ids = self.vocab.lookup_indices_2d(tokens) word_ids = pad_2d(word_ids, seq_lens, self.pad_idx) dense_feat = self.normalizer.normalize(dense_feat) logits = self.model( torch.tensor(word_ids), torch.tensor(seq_lens), torch.tensor(dense_feat, dtype=torch.float), ) return self.output_layer(logits) return ModelWithDenseFeat() if "dense" in tensorizers else Model()
[docs] @classmethod def create_embedding(cls, config: Config, tensorizers: Dict[str, Tensorizer]): return create_module( config.embedding, tensorizer=tensorizers["tokens"], init_from_saved_state=config.init_from_saved_state, )
[docs] @classmethod def create_decoder(cls, config: Config, representation_dim: int, num_labels: int): num_decoder_modules = 0 in_dim = representation_dim if hasattr(config.inputs, "dense") and config.inputs.dense: num_decoder_modules += 1 in_dim += config.inputs.dense.dim decoder = create_module(config.decoder, in_dim=in_dim, out_dim=num_labels) decoder.num_decoder_modules = num_decoder_modules return decoder
[docs] @classmethod def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]): labels = tensorizers["labels"].vocab embedding = cls.create_embedding(config, tensorizers) representation = create_module( config.representation, embed_dim=embedding.embedding_dim ) decoder = cls.create_decoder( config, representation.representation_dim, len(labels) ) label_weights = ( get_label_weights(labels.idx, config.output_layer.label_weights) if config.output_layer.label_weights else None ) loss = create_loss(config.output_layer.loss, weight=label_weights) if isinstance(loss, BinaryCrossEntropyLoss): output_layer_cls = BinaryClassificationOutputLayer elif isinstance(loss, MultiLabelSoftMarginLoss): output_layer_cls = MultiLabelOutputLayer else: output_layer_cls = MulticlassOutputLayer output_layer = output_layer_cls(list(labels), loss) return cls(embedding, representation, decoder, output_layer)
[docs]class ByteTokensDocumentModel(DocModel): """ DocModel that receives both word IDs and byte IDs as inputs (concatenating word and byte-token embeddings to represent input tokens). """
[docs] class Config(DocModel.Config): class ByteModelInput(DocModel.Config.ModelInput): token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config() inputs: ByteModelInput = ByteModelInput() byte_embedding: CharacterEmbedding.Config = CharacterEmbedding.Config()
[docs] @classmethod def create_embedding(cls, config, tensorizers: Dict[str, Tensorizer]): word_tensorizer = config.inputs.tokens byte_tensorizer = config.inputs.token_bytes assert word_tensorizer.column == byte_tensorizer.column word_embedding = create_module( config.embedding, tensorizer=tensorizers["tokens"], init_from_saved_state=config.init_from_saved_state, ) byte_embedding = create_module( config.byte_embedding, vocab_size=ByteTokenTensorizer.NUM_BYTES ) return EmbeddingList([word_embedding, byte_embedding], concat=True)
[docs] def arrange_model_inputs(self, tensor_dict): tokens, seq_lens, _ = tensor_dict["tokens"] token_bytes, byte_seq_lens, _ = tensor_dict["token_bytes"] assert (seq_lens == byte_seq_lens).all().item() model_inputs = tokens, token_bytes, seq_lens if "dense" in tensor_dict: model_inputs += (tensor_dict["dense"],) return model_inputs
[docs] def get_export_input_names(self, tensorizers): names = ["tokens_vals", "token_bytes", "tokens_lens"] if "dense" in tensorizers: names.append("float_vec_vals") return names
[docs] def torchscriptify(self, tensorizers, traced_model): output_layer = self.output_layer.torchscript_predictions() max_byte_len = tensorizers["token_bytes"].max_byte_len byte_offset_for_non_padding = tensorizers["token_bytes"].offset_for_non_padding input_vocab = tensorizers["tokens"].vocab class Model(jit.ScriptModule): def __init__(self): super().__init__() self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK]) self.max_byte_len = jit.Attribute(max_byte_len, int) self.byte_offset_for_non_padding = jit.Attribute( byte_offset_for_non_padding, int ) self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int) self.model = traced_model self.output_layer = output_layer @jit.script_method def forward(self, tokens: List[List[str]]): seq_lens = make_sequence_lengths(tokens) word_ids = self.vocab.lookup_indices_2d(tokens) word_ids = pad_2d(word_ids, seq_lens, self.pad_idx) token_bytes, _ = make_byte_inputs( tokens, self.max_byte_len, self.byte_offset_for_non_padding ) logits = self.model( torch.tensor(word_ids), token_bytes, torch.tensor(seq_lens) ) return self.output_layer(logits) class ModelWithDenseFeat(jit.ScriptModule): def __init__(self): super().__init__() self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK]) self.normalizer = tensorizers["dense"].normalizer self.max_byte_len = jit.Attribute(max_byte_len, int) self.byte_offset_for_non_padding = jit.Attribute( byte_offset_for_non_padding, int ) self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int) self.model = traced_model self.output_layer = output_layer @jit.script_method def forward(self, tokens: List[List[str]], dense_feat: List[List[float]]): seq_lens = make_sequence_lengths(tokens) word_ids = self.vocab.lookup_indices_2d(tokens) word_ids = pad_2d(word_ids, seq_lens, self.pad_idx) token_bytes, _ = make_byte_inputs( tokens, self.max_byte_len, self.byte_offset_for_non_padding ) dense_feat = self.normalizer.normalize(dense_feat) logits = self.model( torch.tensor(word_ids), token_bytes, torch.tensor(seq_lens), torch.tensor(dense_feat, dtype=torch.float), ) return self.output_layer(logits) return ModelWithDenseFeat() if "dense" in tensorizers else Model()
[docs]class DocRegressionModel(DocModel): """ Model that's compatible with the new Model abstraction, and is configured for regression tasks (specifically for labels, predictions, and loss). """
[docs] class Config(DocModel.Config): class RegressionModelInput(DocModel.Config.ModelInput): tokens: TokenTensorizer.Config = TokenTensorizer.Config() labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config() inputs: RegressionModelInput = RegressionModelInput() output_layer: RegressionOutputLayer.Config = RegressionOutputLayer.Config()
[docs] @classmethod def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]): embedding = cls.create_embedding(config, tensorizers) representation = create_module( config.representation, embed_dim=embedding.embedding_dim ) decoder = create_module( config.decoder, in_dim=representation.representation_dim, out_dim=1 ) output_layer = RegressionOutputLayer.from_config(config.output_layer) return cls(embedding, representation, decoder, output_layer)
[docs]class PersonalizedDocModel(DocModel): """ DocModel that includes a user embedding which learns user features to produce personalized prediction. In this class, user-embedding is fed directly to the decoder (i.e., does not go through the encoders). """
[docs] class Config(DocModel.Config): class PersonalizedModelInput(DocModel.Config.ModelInput): uid: Optional[UidTensorizer.Config] = UidTensorizer.Config() inputs: PersonalizedModelInput = PersonalizedModelInput() # user_embedding is a representation for a user and is jointly trained # with the model. Consider user ids as "words" to reuse WordEmbedding class. user_embedding: WordEmbedding.Config = WordEmbedding.Config()
[docs] @classmethod def from_config(cls, config, tensorizers: Dict[str, Tensorizer]): model = super().from_config(config, tensorizers) user_embedding = create_module( config.user_embedding, tensorizer=tensorizers["uid"], init_from_saved_state=config.init_from_saved_state, ) # Init user embeddings to be a same vector because we assume user features # are not too different from each other. emb_shape = user_embedding.word_embedding.weight.data.shape with torch.no_grad(): user_embedding.word_embedding.weight.copy_( torch.rand(emb_shape[1]).repeat(emb_shape[0], 1) ) labels = tensorizers["labels"].vocab decoder = cls.create_decoder( config, model.representation.representation_dim + user_embedding.embedding_dim, len(labels), ) return cls( model.embedding, model.representation, decoder, model.output_layer, user_embedding, )
def __init__( self, embedding: EmbeddingBase, representation: RepresentationBase, decoder: DecoderBase, output_layer: OutputLayerBase, user_embedding: Optional[EmbeddingBase] = None, ): super().__init__(embedding, representation, decoder, output_layer) self.user_embedding = user_embedding
[docs] def arrange_model_inputs(self, tensor_dict): model_inputs = super().arrange_model_inputs(tensor_dict) model_inputs += (tensor_dict["uid"],) return model_inputs
[docs] def vocab_to_export(self, tensorizers): export_vocab = super().vocab_to_export(tensorizers) export_vocab["uid"] = list(tensorizers["uid"].vocab) return export_vocab
[docs] def get_export_input_names(self, tensorizers): export_inputs_names = super().get_export_input_names(tensorizers) export_inputs_names += ["uid"] return export_inputs_names
[docs] def torchscriptify(self, tensorizers, traced_model): raise NotImplementedError("This model is not jittable yet.")
[docs] def forward(self, *inputs) -> List[torch.Tensor]: # Override forward() to include user_embedding layer explictily to the # computation of the forward propagation. embedding_input = inputs[: self.embedding.num_emb_modules] token_emb = self.embedding(*embedding_input) inputs, user_ids = inputs[:-1], inputs[-1] other_input = inputs[ self.embedding.num_emb_modules : len(inputs) - self.decoder.num_decoder_modules ] input_representation = self.representation(token_emb, *other_input) # Some LSTM-based representations return states as (h0, c0). if isinstance(input_representation[-1], tuple): input_representation = input_representation[0] user_embeddings = self.user_embedding(user_ids) # Reduce dim from (batch_size, 1, emb_dim) to (batch_size, emb_dim). if user_embeddings.dim() == 3: user_embeddings = user_embeddings.squeeze(dim=1) decoder_inputs: tuple = () if self.decoder.num_decoder_modules: decoder_inputs = inputs[-self.decoder.num_decoder_modules :] return self.decoder( input_representation, user_embeddings, *decoder_inputs ) # Return Tensor's dim = (batch_size, num_classes).