Source code for pytext.models.word_model

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import Union

from pytext.data.tensorizers import (
    ByteTokenTensorizer,
    SlotLabelTensorizer,
    TokenTensorizer,
)
from pytext.data.utils import UNK
from pytext.exporters.exporter import ModelExporter
from pytext.models.decoders.mlp_decoder import MLPDecoder
from pytext.models.embeddings import CharacterEmbedding, WordEmbedding
from pytext.models.model import Model
from pytext.models.module import create_module
from pytext.models.output_layers import CRFOutputLayer, WordTaggingOutputLayer
from pytext.models.representations.bilstm_slot_attn import BiLSTMSlotAttention
from pytext.models.representations.biseqcnn import BSeqCNNRepresentation
from pytext.models.representations.deepcnn import DeepCNNRepresentation
from pytext.models.representations.pass_through import PassThroughRepresentation


[docs]class WordTaggingModel(Model):
    """
    Word tagging model. It can be used for any task that requires predicting the
    tag for a word/token. For example, the following tasks can be modeled as word
    tagging tasks. This is not an exhaustive list.
    1. Part of speech tagging.
    2. Named entity recognition.
    3. Slot filling for task oriented dialog.

    It can be instantiated just like any other :class:`~Model`.
    """

    __EXPANSIBLE__ = True

[docs]    class Config(Model.Config):
        class ModelInput(Model.Config.ModelInput):
            tokens: TokenTensorizer.Config = TokenTensorizer.Config()
            labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()

        inputs: ModelInput = ModelInput()
        embedding: WordEmbedding.Config = WordEmbedding.Config()

        representation: Union[
            BiLSTMSlotAttention.Config,  # TODO: make default when sorting solved
            BSeqCNNRepresentation.Config,
            PassThroughRepresentation.Config,
            DeepCNNRepresentation.Config,
        ] = PassThroughRepresentation.Config()
        output_layer: Union[
            WordTaggingOutputLayer.Config, CRFOutputLayer.Config
        ] = WordTaggingOutputLayer.Config()
        decoder: MLPDecoder.Config = MLPDecoder.Config()

[docs]    @classmethod
    def create_embedding(cls, config, tensorizers):
        vocab = tensorizers["tokens"].vocab
        return WordEmbedding(
            len(vocab), config.embedding.embed_dim, None, None, vocab.idx[UNK], []
        )

[docs]    @classmethod
    def from_config(cls, config, tensorizers):
        labels = tensorizers["labels"].vocab
        embedding = cls.create_embedding(config, tensorizers)
        representation = create_module(
            config.representation, embed_dim=embedding.embedding_dim
        )
        decoder = create_module(
            config.decoder,
            in_dim=representation.representation_dim,
            out_dim=len(labels),
        )
        output_layer = create_module(config.output_layer, labels=labels)
        return cls(embedding, representation, decoder, output_layer)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # CRF module has parameters and it's forward function is not called in
        # model's forward function because of ONNX compatibility issue. This will
        # not work with DDP, thus setting find_unused_parameters to False to work
        # around, can be removed once DDP support params not used in model forward
        # function
        if isinstance(self.output_layer, CRFOutputLayer):
            self.find_unused_parameters = False

[docs]    def arrange_model_inputs(self, tensor_dict):
        tokens, seq_lens, _ = tensor_dict["tokens"]
        return (tokens, seq_lens)

[docs]    def arrange_targets(self, tensor_dict):
        return tensor_dict["labels"]

[docs]    def get_export_input_names(self, tensorizers):
        return ["tokens_vals", "tokens_lens"]

[docs]    def get_export_output_names(self, tensorizers):
        return ["word_scores"]

[docs]    def vocab_to_export(self, tensorizers):
        return {"tokens_vals": list(tensorizers["tokens"].vocab)}

[docs]    def arrange_model_context(self, tensor_dict):
        return {"seq_lens": tensor_dict["tokens"][1]}

[docs]    def caffe2_export(self, tensorizers, tensor_dict, path, export_onnx_path=None):
        exporter = ModelExporter(
            ModelExporter.Config(),
            self.get_export_input_names(tensorizers),
            self.arrange_model_inputs(tensor_dict),
            self.vocab_to_export(tensorizers),
            self.get_export_output_names(tensorizers),
        )
        return exporter.export_to_caffe2(self, path, export_onnx_path=export_onnx_path)


[docs]class WordTaggingLiteModel(WordTaggingModel):
    """
    Also a word tagging model, but uses bytes as inputs to the model. Using
    bytes instead of words, the model does not need to store a word embedding
    table mapping words in the vocab to their embedding vector representations,
    but instead compute them on the fly using CharacterEmbedding. This produces
    an exported/serialized model that requires much less storage space as well
    as less memory during run/inference time.
    """

[docs]    class Config(WordTaggingModel.Config):
        class ByteModelInput(Model.Config.ModelInput):
            # We should support characters as well, but CharacterTokenTensorizer
            # does not support adding characters to vocab yet.
            token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
            labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()

        inputs: ByteModelInput = ByteModelInput()
        embedding: CharacterEmbedding.Config = CharacterEmbedding.Config()

[docs]    @classmethod
    def create_embedding(cls, config, tensorizers):
        return CharacterEmbedding(
            tensorizers["token_bytes"].NUM_BYTES,
            config.embedding.embed_dim,
            config.embedding.cnn.kernel_num,
            config.embedding.cnn.kernel_sizes,
            config.embedding.highway_layers,
            config.embedding.projection_dim,
        )

[docs]    def vocab_to_export(self, tensorizers):
        return {}

[docs]    def get_export_input_names(self, tensorizers):
        return ["token_bytes", "token_lens"]

[docs]    def arrange_model_inputs(self, tensor_dict):
        token_bytes, tokens_lens, _ = tensor_dict["token_bytes"]
        return (token_bytes, tokens_lens)

[docs]    def arrange_model_context(self, tensor_dict):
        return {"seq_lens": tensor_dict["token_bytes"][1]}