Source code for pytext.models.embeddings.dict_embedding

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import Optional

import torch
import torch.nn as nn
import torch.onnx.operators
from pytext.config.field_config import DictFeatConfig
from pytext.config.module_config import PoolingType
from pytext.data.tensorizers import Tensorizer
from pytext.data.utils import PAD_INDEX, UNK_INDEX, Vocabulary
from pytext.fields import FieldMeta
from pytext.utils.usage import log_class_usage

from .embedding_base import EmbeddingBase


class DictEmbedding(EmbeddingBase):
    """
    Module for dictionary feature embeddings for tokens. Dictionary features are
    also known as gazetteer features. These are per-token discrete features for
    which the module learns embeddings.

    Example: For the utterance *Order coffee from Starbucks*, the dictionary
    features could be
    ::

        [
            {"tokenIdx": 1, "features": {"drink/beverage": 0.8, "music/song": 0.2}},
            {"tokenIdx": 3, "features": {"store/coffee_shop": 1.0}}
        ]

    Thus, a given token can have more than one dictionary feature, each with its
    own confidence score. The final embedding for a token is the weighted average
    of the dictionary embeddings followed by a pooling operation, so that the
    module produces one embedding vector per token.

    Args:
        num_embeddings (int): Total number of dictionary features (vocabulary size).
        embed_dim (int): Size of embedding vector.
        pooling_type (PoolingType): Type of pooling for combining the dictionary
            feature embeddings.

    Attributes:
        pooling_type (PoolingType): Type of pooling for combining the dictionary
            feature embeddings.
    """

    Config = DictFeatConfig
    @classmethod
    def from_config(
        cls,
        config: DictFeatConfig,
        metadata: Optional[FieldMeta] = None,
        labels: Optional[Vocabulary] = None,
        tensorizer: Optional[Tensorizer] = None,
    ):
        """Factory method to construct an instance of DictEmbedding from
        the module's config object and the field's metadata object.

        Args:
            config (DictFeatConfig): Configuration object specifying all the
                parameters of DictEmbedding.
            metadata (FieldMeta): Object containing this field's metadata.

        Returns:
            type: An instance of DictEmbedding.
        """
        # TODO: clean this up once fully migrated to new data handler design
        vocab_size = (
            len(tensorizer.vocab)
            if tensorizer is not None
            else len(labels)
            if labels is not None
            else metadata.vocab_size
        )
        tensorizer_vocab_exists = tensorizer and tensorizer.vocab
        pad_index = (
            tensorizer.vocab.get_pad_index() if tensorizer_vocab_exists else PAD_INDEX
        )
        unk_index = (
            tensorizer.vocab.get_unk_index() if tensorizer_vocab_exists else UNK_INDEX
        )
        return cls(
            num_embeddings=vocab_size,
            embed_dim=config.embed_dim,
            pooling_type=config.pooling,
            pad_index=pad_index,
            unk_index=unk_index,
            mobile=config.mobile,
        )
    def __init__(
        self,
        num_embeddings: int,
        embed_dim: int,
        pooling_type: PoolingType,
        pad_index: int = PAD_INDEX,
        unk_index: int = UNK_INDEX,
        mobile: bool = False,
    ) -> None:
        super().__init__(embed_dim)
        self.unk_index = unk_index
        self.pad_index = pad_index
        self.embedding = nn.Embedding(
            num_embeddings, embed_dim, padding_idx=self.pad_index
        )
        # Temporary workaround till https://github.com/pytorch/pytorch/issues/32840
        # is resolved
        self.pooling_type = pooling_type.value
        self.mobile = mobile
        log_class_usage(__class__)
    def find_and_replace(
        self, tensor: torch.Tensor, find_val: int, replace_val: int
    ) -> torch.Tensor:
        """
        `torch.where` is not supported for mobile ONNX export; this hack provides
        a mobile-exportable equivalent of `torch.where`, at a higher computational
        cost.
        """
        if self.mobile:
            mask = torch.eq(tensor, find_val)
            return tensor * (1 - mask.long()) + mask * replace_val
        else:
            return torch.where(
                tensor == find_val, torch.full_like(tensor, replace_val), tensor
            )
    def forward(
        self, feats: torch.Tensor, weights: torch.Tensor, lengths: torch.Tensor
    ) -> torch.Tensor:
        """Given a batch of sentences containing dictionary feature ids per
        token, produce token embedding vectors for each sentence in the batch.

        Args:
            feats (torch.Tensor): Batch of sentences with dictionary feature ids.
                shape: [bsz, seq_len * max_feat_per_token]
            weights (torch.Tensor): Batch of sentences with dictionary feature
                weights for the dictionary features.
                shape: [bsz, seq_len * max_feat_per_token]
            lengths (torch.Tensor): Batch of sentences with the number of
                dictionary features per token.
                shape: [bsz, seq_len]

        Returns:
            torch.Tensor: Embedded batch of sentences of shape
                batch size x maximum sentence length x token embedding size,
                where token embedding size = `embed_dim` passed to the constructor.
        """
        batch_size = torch.onnx.operators.shape_as_tensor(feats)[0]
        max_toks = torch.onnx.operators.shape_as_tensor(lengths)[1]
        if self.unk_index != self.pad_index:
            # convert all unk indices to pad indices
            feats = self.find_and_replace(feats, self.unk_index, self.pad_index)
        dict_emb = self.embedding(feats)

        # Calculate weighted average of the embeddings
        weighted_embds = dict_emb * weights.unsqueeze(2)
        new_emb_shape = torch.cat(
            (
                batch_size.view(1),
                max_toks.view(1),
                torch.tensor([-1]).long(),
                torch.tensor([weighted_embds.size()[-1]]).long(),
            )
        )
        weighted_embds = torch.onnx.operators.reshape_from_tensor_shape(
            weighted_embds, new_emb_shape
        )
        # Temporary workaround till https://github.com/pytorch/pytorch/issues/32840
        # is resolved
        if self.pooling_type == "mean":
            reduced_embeds = torch.sum(weighted_embds, dim=2) / lengths.unsqueeze(
                2
            ).to(weighted_embds.dtype)
        elif self.pooling_type == "max":
            reduced_embeds, _ = torch.max(weighted_embds, dim=2)
        else:
            raise RuntimeError(f"Pooling type {self.pooling_type} is unsupported.")

        return reduced_embeds
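
Below is a minimal usage sketch of the module above. It is not part of the original source: it assumes the published pytext package layout and that `PoolingType.MEAN` is available, and the vocabulary size, embedding dimension, tensor shapes, feature ids, weights, and lengths are all illustrative values chosen to match the documented shapes.

# Usage sketch (illustrative, not part of the module above).
import torch

from pytext.config.module_config import PoolingType
from pytext.models.embeddings.dict_embedding import DictEmbedding

# 10 dictionary features, 4-dim embeddings, mean pooling per token.
embedding = DictEmbedding(
    num_embeddings=10, embed_dim=4, pooling_type=PoolingType.MEAN
)

bsz, seq_len, max_feat_per_token = 2, 3, 2

# Dictionary feature ids per token, flattened to
# [bsz, seq_len * max_feat_per_token]. Unused slots get weight 0.0 below,
# so they do not contribute to the weighted average.
feats = torch.tensor(
    [
        [3, 0, 4, 0, 5, 6],  # sentence 1
        [7, 8, 0, 0, 9, 0],  # sentence 2
    ],
    dtype=torch.long,
)

# Confidence score for each dictionary feature, same shape as `feats`.
weights = torch.tensor(
    [
        [0.8, 0.0, 1.0, 0.0, 0.6, 0.4],
        [0.9, 0.1, 0.0, 0.0, 1.0, 0.0],
    ]
)

# Number of dictionary features attached to each token, shape [bsz, seq_len].
lengths = torch.tensor([[1, 1, 2], [2, 1, 1]], dtype=torch.long)

token_embeddings = embedding(feats, weights, lengths)
print(token_embeddings.shape)  # expected: torch.Size([2, 3, 4])

The forward pass reshapes the weighted feature embeddings to [bsz, seq_len, max_feat_per_token, embed_dim] and then pools over the feature dimension, which is why the result has one `embed_dim`-sized vector per token.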