#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import Union
from pytext.data.tensorizers import (
ByteTokenTensorizer,
SlotLabelTensorizer,
TokenTensorizer,
)
from pytext.data.utils import UNK
from pytext.exporters.exporter import ModelExporter
from pytext.models.decoders.mlp_decoder import MLPDecoder
from pytext.models.embeddings import CharacterEmbedding, WordEmbedding
from pytext.models.model import Model
from pytext.models.module import create_module
from pytext.models.output_layers import CRFOutputLayer, WordTaggingOutputLayer
from pytext.models.representations.bilstm_slot_attn import BiLSTMSlotAttention
from pytext.models.representations.biseqcnn import BSeqCNNRepresentation
from pytext.models.representations.deepcnn import DeepCNNRepresentation
from pytext.models.representations.pass_through import PassThroughRepresentation
[docs]class WordTaggingModel(Model):
"""
Word tagging model. It can be used for any task that requires predicting the
tag for a word/token. For example, the following tasks can be modeled as word
tagging tasks. This is not an exhaustive list.
1. Part of speech tagging.
2. Named entity recognition.
3. Slot filling for task oriented dialog.
It can be instantiated just like any other :class:`~Model`.
"""
__EXPANSIBLE__ = True
[docs] class Config(Model.Config):
class ModelInput(Model.Config.ModelInput):
tokens: TokenTensorizer.Config = TokenTensorizer.Config()
labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()
inputs: ModelInput = ModelInput()
embedding: WordEmbedding.Config = WordEmbedding.Config()
representation: Union[
BiLSTMSlotAttention.Config, # TODO: make default when sorting solved
BSeqCNNRepresentation.Config,
PassThroughRepresentation.Config,
DeepCNNRepresentation.Config,
] = PassThroughRepresentation.Config()
output_layer: Union[
WordTaggingOutputLayer.Config, CRFOutputLayer.Config
] = WordTaggingOutputLayer.Config()
decoder: MLPDecoder.Config = MLPDecoder.Config()
[docs] @classmethod
def create_embedding(cls, config, tensorizers):
vocab = tensorizers["tokens"].vocab
return WordEmbedding(
len(vocab), config.embedding.embed_dim, None, None, vocab.idx[UNK], []
)
[docs] @classmethod
def from_config(cls, config, tensorizers):
labels = tensorizers["labels"].vocab
embedding = cls.create_embedding(config, tensorizers)
representation = create_module(
config.representation, embed_dim=embedding.embedding_dim
)
decoder = create_module(
config.decoder,
in_dim=representation.representation_dim,
out_dim=len(labels),
)
output_layer = create_module(config.output_layer, labels=labels)
return cls(embedding, representation, decoder, output_layer)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# CRF module has parameters and it's forward function is not called in
# model's forward function because of ONNX compatibility issue. This will
# not work with DDP, thus setting find_unused_parameters to False to work
# around, can be removed once DDP support params not used in model forward
# function
if isinstance(self.output_layer, CRFOutputLayer):
self.find_unused_parameters = False
[docs] def arrange_targets(self, tensor_dict):
return tensor_dict["labels"]
[docs] def get_export_output_names(self, tensorizers):
return ["word_scores"]
[docs] def vocab_to_export(self, tensorizers):
return {"tokens_vals": list(tensorizers["tokens"].vocab)}
[docs] def arrange_model_context(self, tensor_dict):
return {"seq_lens": tensor_dict["tokens"][1]}
[docs] def caffe2_export(self, tensorizers, tensor_dict, path, export_onnx_path=None):
exporter = ModelExporter(
ModelExporter.Config(),
self.get_export_input_names(tensorizers),
self.arrange_model_inputs(tensor_dict),
self.vocab_to_export(tensorizers),
self.get_export_output_names(tensorizers),
)
return exporter.export_to_caffe2(self, path, export_onnx_path=export_onnx_path)
[docs]class WordTaggingLiteModel(WordTaggingModel):
"""
Also a word tagging model, but uses bytes as inputs to the model. Using
bytes instead of words, the model does not need to store a word embedding
table mapping words in the vocab to their embedding vector representations,
but instead compute them on the fly using CharacterEmbedding. This produces
an exported/serialized model that requires much less storage space as well
as less memory during run/inference time.
"""
[docs] class Config(WordTaggingModel.Config):
class ByteModelInput(Model.Config.ModelInput):
# We should support characters as well, but CharacterTokenTensorizer
# does not support adding characters to vocab yet.
token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()
inputs: ByteModelInput = ByteModelInput()
embedding: CharacterEmbedding.Config = CharacterEmbedding.Config()
[docs] @classmethod
def create_embedding(cls, config, tensorizers):
return CharacterEmbedding(
tensorizers["token_bytes"].NUM_BYTES,
config.embedding.embed_dim,
config.embedding.cnn.kernel_num,
config.embedding.cnn.kernel_sizes,
config.embedding.highway_layers,
config.embedding.projection_dim,
)
[docs] def vocab_to_export(self, tensorizers):
return {}
[docs] def arrange_model_context(self, tensor_dict):
return {"seq_lens": tensor_dict["token_bytes"][1]}