Source code for pytext.models.word_model

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import List, Optional, Union

import torch
from pytext.common.constants import SpecialTokens
from pytext.data.tensorizers import (
    ByteTokenTensorizer,
    SlotLabelTensorizer,
    TokenTensorizer,
)
from pytext.data.tokenizers import DoNothingTokenizer
from pytext.exporters.exporter import ModelExporter
from pytext.models.decoders.mlp_decoder import MLPDecoder
from pytext.models.embeddings import CharacterEmbedding, WordEmbedding
from pytext.models.model import Model
from pytext.models.module import create_module
from pytext.models.output_layers import CRFOutputLayer, WordTaggingOutputLayer
from pytext.models.representations.bilstm_slot_attn import BiLSTMSlotAttention
from pytext.models.representations.biseqcnn import BSeqCNNRepresentation
from pytext.models.representations.deepcnn import DeepCNNRepresentation
from pytext.models.representations.pass_through import PassThroughRepresentation
from pytext.torchscript.utils import (
    make_byte_inputs,
    make_sequence_lengths,
    pad_2d,
    truncate_tokens,
)
from pytext.torchscript.vocab import ScriptVocabulary
from pytext.utils.usage import log_class_usage


class WordTaggingModel(Model):
    """
    Word tagging model. It can be used for any task that requires predicting the
    tag for a word/token. For example, the following tasks can be modeled as word
    tagging tasks. This is not an exhaustive list.

    1. Part of speech tagging.
    2. Named entity recognition.
    3. Slot filling for task oriented dialog.

    It can be instantiated just like any other :class:`~Model`.
    """

    __EXPANSIBLE__ = True

    class Config(Model.Config):
        class ModelInput(Model.Config.ModelInput):
            tokens: TokenTensorizer.Config = TokenTensorizer.Config()
            labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()

        inputs: ModelInput = ModelInput()
        embedding: WordEmbedding.Config = WordEmbedding.Config()
        representation: Union[
            BiLSTMSlotAttention.Config,  # TODO: make default when sorting solved
            BSeqCNNRepresentation.Config,
            PassThroughRepresentation.Config,
            DeepCNNRepresentation.Config,
        ] = PassThroughRepresentation.Config()
        output_layer: Union[
            WordTaggingOutputLayer.Config, CRFOutputLayer.Config
        ] = WordTaggingOutputLayer.Config()
        decoder: MLPDecoder.Config = MLPDecoder.Config()

    @classmethod
    def create_embedding(cls, config, tensorizers):
        vocab = tensorizers["tokens"].vocab
        return WordEmbedding(
            len(vocab),
            config.embedding.embed_dim,
            None,
            None,
            vocab.idx[SpecialTokens.UNK],
            [],
        )

    @classmethod
    def from_config(cls, config, tensorizers):
        labels = tensorizers["labels"].vocab
        embedding = cls.create_embedding(config, tensorizers)
        representation = create_module(
            config.representation, embed_dim=embedding.embedding_dim
        )
        decoder = create_module(
            config.decoder,
            in_dim=representation.representation_dim,
            out_dim=len(labels),
        )
        output_layer = create_module(config.output_layer, labels=labels)
        return cls(embedding, representation, decoder, output_layer)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The CRF module has parameters, but its forward function is not called
        # inside the model's forward function because of an ONNX compatibility
        # issue. This breaks DDP, so we set find_unused_parameters to False as a
        # workaround; it can be removed once DDP supports parameters that are not
        # used in the model's forward function.
        if isinstance(self.output_layer, CRFOutputLayer):
            self.find_unused_parameters = False
        log_class_usage(__class__)

    def arrange_model_inputs(self, tensor_dict):
        tokens, seq_lens, _ = tensor_dict["tokens"]
        return (tokens, seq_lens)

    def arrange_targets(self, tensor_dict):
        return tensor_dict["labels"]

    def get_export_input_names(self, tensorizers):
        return ["tokens_vals", "tokens_lens"]

    def get_export_output_names(self, tensorizers):
        return ["word_scores"]

    def vocab_to_export(self, tensorizers):
        return {"tokens_vals": list(tensorizers["tokens"].vocab)}

    def arrange_model_context(self, tensor_dict):
        return {"seq_lens": tensor_dict["tokens"][1]}

    def torchscriptify(self, tensorizers, traced_model):
        output_layer = self.output_layer.torchscript_predictions()

        input_vocab = tensorizers["tokens"].vocab
        max_seq_len = tensorizers["tokens"].max_seq_len or -1
        scripted_tokenizer: Optional[torch.jit.ScriptModule] = None
        try:
            scripted_tokenizer = tensorizers["tokens"].tokenizer.torchscriptify()
        except NotImplementedError:
            pass
        if scripted_tokenizer and isinstance(scripted_tokenizer, DoNothingTokenizer):
            scripted_tokenizer = None

        """
        Memory for input tensor packing is allocated/cached per tensor shape, so
        capping the sequence length reduces the number of distinct shapes the
        model sees. Without max_seq_len, we observed the TorchScript model using
        25G of memory for offline inference on CPU.
        """

        class Model(torch.jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = ScriptVocabulary(
                    input_vocab,
                    input_vocab.get_unk_index(),
                    input_vocab.get_pad_index(),
                )
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = torch.jit.Attribute(input_vocab.get_pad_index(), int)
                self.max_seq_len = torch.jit.Attribute(max_seq_len, int)
                self.tokenizer = scripted_tokenizer

            @torch.jit.script_method
            def forward(
                self,
                texts: Optional[List[str]] = None,
                multi_texts: Optional[List[List[str]]] = None,
                tokens: Optional[List[List[str]]] = None,
                languages: Optional[List[str]] = None,
            ):
                # PyTorch breaks with 2 'not None' checks right now.
                if texts is not None:
                    if tokens is not None:
                        raise RuntimeError("Can't set both tokens and texts")
                    if self.tokenizer is not None:
                        tokens = [
                            [t[0] for t in self.tokenizer.tokenize(text)]
                            for text in texts
                        ]

                if tokens is None:
                    raise RuntimeError("tokens is required")

                tokens = truncate_tokens(tokens, self.max_seq_len, self.vocab.pad_token)
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                logits = self.model(torch.tensor(word_ids), torch.tensor(seq_lens))
                return self.output_layer(logits)

        return Model()

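
# A minimal, hypothetical usage sketch (not part of the original module): it
# shows how the ScriptModule returned by WordTaggingModel.torchscriptify()
# above could be invoked for inference. The `scripted_model` argument and the
# example tokens are assumptions for illustration only.
def _example_run_scripted_word_tagger(scripted_model):
    # Pre-tokenized input: one list of tokens per example in the batch. If a
    # scripted tokenizer was attached, raw `texts` could be passed instead.
    tokens = [["set", "an", "alarm", "for", "seven", "am"]]
    # The scripted forward() truncates, looks up word ids, pads the batch, and
    # returns the per-token predictions from the output layer.
    return scripted_model(tokens=tokens)
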
class WordTaggingLiteModel(WordTaggingModel):
    """
    Also a word tagging model, but it uses bytes as inputs. Because it uses
    bytes instead of words, the model does not need to store a word embedding
    table mapping words in the vocab to their embedding vector representations;
    instead it computes them on the fly using CharacterEmbedding. This produces
    an exported/serialized model that requires much less storage space as well
    as less memory at run/inference time.
    """

    class Config(WordTaggingModel.Config):
        class ByteModelInput(Model.Config.ModelInput):
            # We should support characters as well, but CharacterTokenTensorizer
            # does not support adding characters to the vocab yet.
            token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
            labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()

        inputs: ByteModelInput = ByteModelInput()
        embedding: CharacterEmbedding.Config = CharacterEmbedding.Config()

    @classmethod
    def create_embedding(cls, config, tensorizers):
        return CharacterEmbedding(
            tensorizers["token_bytes"].NUM_BYTES,
            config.embedding.embed_dim,
            config.embedding.cnn.kernel_num,
            config.embedding.cnn.kernel_sizes,
            config.embedding.highway_layers,
            config.embedding.projection_dim,
        )

    def vocab_to_export(self, tensorizers):
        return {}

    def get_export_input_names(self, tensorizers):
        return ["token_bytes", "token_lens"]

    def arrange_model_inputs(self, tensor_dict):
        token_bytes, tokens_lens, _ = tensor_dict["token_bytes"]
        return (token_bytes, tokens_lens)

    def arrange_model_context(self, tensor_dict):
        return {"seq_lens": tensor_dict["token_bytes"][1]}

    def torchscriptify(self, tensorizers, traced_model):
        output_layer = self.output_layer.torchscript_predictions()
        max_seq_len = tensorizers["token_bytes"].max_seq_len or -1
        max_byte_len = tensorizers["token_bytes"].max_byte_len
        byte_offset_for_non_padding = tensorizers["token_bytes"].offset_for_non_padding

        class Model(torch.jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.max_seq_len = torch.jit.Attribute(max_seq_len, int)
                self.max_byte_len = torch.jit.Attribute(max_byte_len, int)
                self.byte_offset_for_non_padding = torch.jit.Attribute(
                    byte_offset_for_non_padding, int
                )
                self.model = traced_model
                self.output_layer = output_layer

            @torch.jit.script_method
            def forward(
                self,
                texts: Optional[List[str]] = None,
                multi_texts: Optional[List[List[str]]] = None,
                tokens: Optional[List[List[str]]] = None,
                languages: Optional[List[str]] = None,
            ):
                if tokens is None:
                    raise RuntimeError("tokens is required")

                tokens = truncate_tokens(tokens, self.max_seq_len, SpecialTokens.PAD)
                seq_lens = make_sequence_lengths(tokens)
                token_bytes, _ = make_byte_inputs(
                    tokens, self.max_byte_len, self.byte_offset_for_non_padding
                )
                logits = self.model(token_bytes, torch.tensor(seq_lens))
                return self.output_layer(logits)

        return Model()
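
# A minimal, hypothetical usage sketch (not part of the original module): it
# shows how the byte-based ScriptModule returned by
# WordTaggingLiteModel.torchscriptify() above could be invoked. Because tokens
# are converted to byte ids on the fly via make_byte_inputs, no word vocabulary
# has to be exported with the model. The `scripted_lite_model` argument is an
# assumption for illustration only.
def _example_run_scripted_lite_word_tagger(scripted_lite_model):
    # The lite model does not carry a scripted tokenizer, so it expects
    # pre-tokenized input rather than raw text.
    tokens = [["turn", "off", "the", "kitchen", "lights"]]
    return scripted_lite_model(tokens=tokens)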