Source code for pytext.models.embeddings.char_embedding

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytext.config.field_config import CharFeatConfig
from pytext.data.utils import Vocabulary
from pytext.fields import FieldMeta
from pytext.utils.usage import log_class_usage

from .embedding_base import EmbeddingBase


class CharacterEmbedding(EmbeddingBase):
    """
    Module for character-aware CNN embeddings for tokens. It uses convolution
    followed by max-pooling over character embeddings to obtain an embedding
    vector for each token. Implementation is loosely based on
    https://arxiv.org/abs/1508.06615.

    Args:
        num_embeddings (int): Total number of characters (vocabulary size).
        embed_dim (int): Size of character embeddings passed to the convolutions.
        out_channels (int): Number of output channels (feature maps) per kernel size.
        kernel_sizes (List[int]): Widths of the 1-D convolution kernels applied
            over the character sequence of each token.
        highway_layers (int): Number of highway layers applied to pooled output.
        projection_dim (int): If specified, size of the output embedding for each
            token, obtained via a linear projection from the convolution output.

    Attributes:
        char_embed (nn.Embedding): Character embedding table.
        convs (nn.ModuleList): Convolution layers that operate on character
            embeddings.
        highway_layers (nn.Module): Highway layers on top of convolution output.
        projection (nn.Module): Final linear layer to token embedding.
        embedding_dim (int): Dimension of the final token embedding produced.

    """

    Config = CharFeatConfig
    @classmethod
    def from_config(
        cls,
        config: CharFeatConfig,
        metadata: Optional[FieldMeta] = None,
        vocab_size: Optional[int] = None,
    ):
        """Factory method to construct an instance of CharacterEmbedding from
        the module's config object and the field's metadata object.

        Args:
            config (CharFeatConfig): Configuration object specifying all the
                parameters of CharacterEmbedding.
            metadata (FieldMeta): Object containing this field's metadata.
            vocab_size (int): If given, used instead of ``metadata.vocab_size``.

        Returns:
            type: An instance of CharacterEmbedding.

        """
        if vocab_size is None:
            vocab_size = metadata.vocab_size
        return cls(
            vocab_size,
            config.embed_dim,
            config.cnn.kernel_num,
            config.cnn.kernel_sizes,
            config.highway_layers,
            config.projection_dim,
        )
    def __init__(
        self,
        num_embeddings: int,
        embed_dim: int,
        out_channels: int,
        kernel_sizes: List[int],
        highway_layers: int,
        projection_dim: Optional[int],
        *args,
        **kwargs,
    ) -> None:
        conv_out_dim = len(kernel_sizes) * out_channels
        output_dim = projection_dim or conv_out_dim
        super().__init__(output_dim)

        self.char_embed = nn.Embedding(num_embeddings, embed_dim)
        self.convs = nn.ModuleList(
            [
                # in_channels = embed_dim because input is treated as a sequence
                # of dim [max_word_length] with embed_dim channels.
                # Padding provides robustness in cases where the input length
                # is less than the conv filter width.
                nn.Conv1d(embed_dim, out_channels, K, padding=K // 2)
                for K in kernel_sizes
            ]
        )
        self.highway = None
        if highway_layers > 0:
            self.highway = Highway(conv_out_dim, highway_layers)
        self.projection = None
        if projection_dim:
            self.projection = nn.Linear(conv_out_dim, projection_dim)

        log_class_usage(__class__)
    def forward(self, chars: torch.Tensor) -> torch.Tensor:
        """
        Given a batch of sentences such that tokens are broken into character ids,
        produce token embedding vectors for each sentence in the batch.

        Args:
            chars (torch.Tensor): Batch of sentences where each token is broken
                into characters.
                Dimension: batch size X maximum sentence length X maximum word length

        Returns:
            torch.Tensor: Embedded batch of sentences.
                Dimension: batch size X maximum sentence length X token embedding size.
                Token embedding size = `out_channels * len(self.convs)`, or
                `projection_dim` if a projection is configured.

        """
        batch_size = chars.size(0)
        max_sent_length = chars.size(1)
        max_word_length = chars.size(2)
        chars = chars.view(batch_size * max_sent_length, max_word_length)

        # char_embedding: (bsize * max_sent_length, max_word_length, embed_dim)
        char_embedding = self.char_embed(chars)

        # conv_inp dim: (bsize * max_sent_length, emb_size, max_word_length)
        conv_inp = char_embedding.transpose(1, 2)
        char_conv_outs = [F.relu(conv(conv_inp)) for conv in self.convs]

        # Apply max pooling
        # char_pool_outs[i] dims: (bsize * max_sent_length, out_channels)
        char_pool_outs = [torch.max(out, dim=2)[0] for out in char_conv_outs]

        # Concat the different feature maps together
        # char_out dim: (bsize * max_sent_length, out_channels * num_kernels)
        char_out = torch.cat(char_pool_outs, 1)

        # Highway layers preserve dims
        if self.highway is not None:
            char_out = self.highway(char_out)

        if self.projection is not None:
            # Linear map to the final embedding size:
            # (bsize * max_sent_length, projection_dim)
            char_out = self.projection(char_out)

        # Reshape to (bsize, max_sent_length, output_dim)
        return char_out.view(batch_size, max_sent_length, -1)
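
# Example (not part of the pytext source): a minimal sketch of how
# CharacterEmbedding maps a batch of character ids to per-token embeddings.
# The function name, vocabulary size, dimensions, and batch shape below are
# illustrative assumptions, not values taken from any pytext config.
def _example_character_embedding() -> None:
    emb = CharacterEmbedding(
        num_embeddings=100,   # character vocab size (assumed)
        embed_dim=8,          # per-character embedding size (assumed)
        out_channels=16,      # feature maps per kernel size (assumed)
        kernel_sizes=[2, 3],  # conv widths over characters (assumed)
        highway_layers=1,
        projection_dim=None,  # no projection: output dim = 16 * 2 = 32
    )
    # Batch of 4 sentences, 10 tokens each, 12 characters per token.
    chars = torch.randint(0, 100, (4, 10, 12))
    out = emb(chars)
    assert out.shape == (4, 10, 32)  # (batch, max_sent_length, embedding_dim)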
class Highway(nn.Module):
    """
    A `Highway layer <https://arxiv.org/abs/1505.00387>`_.
    Adapted from the AllenNLP implementation.
    """

    def __init__(self, input_dim: int, num_layers: int = 1):
        super().__init__()
        self.input_dim = input_dim
        self.layers = nn.ModuleList(
            [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]
        )
        self.activation = nn.ReLU()
        self.reset_parameters()
    def reset_parameters(self):
        for layer in self.layers:
            # As per comment in AllenNLP:
            # We should bias the highway layer to just carry its input forward. We do
            # that by setting the bias on `B(x)` to be positive, because that means `g`
            # will be biased to be high, so we will carry the input forward. The bias
            # on `B(x)` is the second half of the bias vector in each Linear layer.
            nn.init.constant_(layer.bias[self.input_dim :], 1)
            nn.init.constant_(layer.bias[: self.input_dim], 0)
            nn.init.xavier_normal_(layer.weight)
    def forward(self, x: torch.Tensor):
        for layer in self.layers:
            projection = layer(x)
            proj_x, gate = projection.chunk(2, dim=-1)
            proj_x = self.activation(proj_x)
            gate = torch.sigmoid(gate)
            # gate * x carries the input forward; (1 - gate) mixes in the
            # transformed projection.
            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
        return x
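
# Example (not part of the pytext source): a minimal sketch of the Highway
# computation, out = g * x + (1 - g) * relu(W1 x + b1) with g = sigmoid(W2 x + b2),
# showing that the layer preserves the input dimension. The function name and
# sizes below are illustrative assumptions.
def _example_highway() -> None:
    highway = Highway(input_dim=32, num_layers=2)
    x = torch.randn(5, 32)
    out = highway(x)
    assert out.shape == x.shape  # highway layers keep the feature size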