Source code for pytext.models.representations.attention

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytext.models.module import Module
from pytext.utils.usage import log_class_usage


class DotProductSelfAttention(Module):
    """
    Given vector w and token vectors = {t1, t2, ..., t_n}, compute self
    attention weights to weigh the tokens
    * a_j = softmax(w . t_j)
    """

    class Config(Module.Config):
        input_dim: int = 32

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.input_dim)

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)
        log_class_usage(__class__)

    def forward(self, tokens, tokens_mask):
        """
        Input:
            tokens: batch_size * seq_len * input_dim
            tokens_mask: batch_size * seq_len (1 for padding, 0 for true)
        Output:
            alpha: batch_size * seq_len
        """
        scores = self.linear(tokens).squeeze(2)
        scores.data.masked_fill_(tokens_mask.data, -float("inf"))
        return F.softmax(scores, dim=-1)
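

# Usage sketch (illustrative, not part of the original module): the tensor
# shapes and the final pooling step are assumptions about typical usage.
def _example_dot_product_self_attention():
    attention = DotProductSelfAttention(input_dim=32)
    tokens = torch.randn(4, 10, 32)                     # batch_size=4, seq_len=10, input_dim=32
    tokens_mask = torch.zeros(4, 10, dtype=torch.bool)  # True (1) marks padding positions
    tokens_mask[:, 8:] = True                           # last two tokens of each sequence are pads
    alpha = attention(tokens, tokens_mask)              # batch_size * seq_len attention weights
    # Padded positions receive weight 0; pool the tokens with the weights.
    pooled = torch.bmm(alpha.unsqueeze(1), tokens).squeeze(1)  # batch_size * input_dim
    return pooled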


class SequenceAlignedAttention(Module):
    """
    Given sequences P and Q, computes attention weights for each element in P
    by matching Q with each element in P.
    * a_i_j = softmax(p_i . q_j) where softmax is computed by summing over q_j
    """

    class Config(Module.Config):
        proj_dim: int = 32

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.proj_dim)

    def __init__(self, proj_dim):
        super().__init__()
        self.linear = nn.Linear(proj_dim, proj_dim)
        self.proj_dim = proj_dim
        log_class_usage(__class__)

    def forward(self, p: torch.Tensor, q: torch.Tensor, q_mask: torch.Tensor):
        """
        Input:
            p: batch_size * p_seq_len * dim
            q: batch_size * q_seq_len * dim
            q_mask: batch_size * q_seq_len (1 for padding, 0 for true)
        Output:
            attn_scores: batch_size * p_seq_len * q_seq_len
        """
        p_transform = F.relu(self.linear(p))
        q_transform = F.relu(self.linear(q))
        # Compute scores s_ij: bsz * p_seq_len * q_seq_len
        attn_scores = p_transform.bmm(q_transform.transpose(2, 1))
        # Mask padding: set a very low score for q tokens that are pads.
        q_mask = q_mask.unsqueeze(1).expand(attn_scores.size())
        attn_scores.data.masked_fill_(q_mask.data, -float("inf"))
        # Normalize with softmax over the q dimension: bsz * p_seq_len * q_seq_len
        attn_scores_flattened = F.softmax(attn_scores.view(-1, q.size(1)), dim=-1)
        return attn_scores_flattened.view(-1, p.size(1), q.size(1))
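

# Usage sketch (illustrative, not part of the original module): P and Q may
# have different lengths but must share the projection dimension proj_dim;
# the matched-sequence step is an assumption about how the weights are used.
def _example_sequence_aligned_attention():
    attention = SequenceAlignedAttention(proj_dim=32)
    p = torch.randn(4, 15, 32)                     # batch_size=4, p_seq_len=15, dim=32
    q = torch.randn(4, 10, 32)                     # batch_size=4, q_seq_len=10, dim=32
    q_mask = torch.zeros(4, 10, dtype=torch.bool)  # True (1) marks padding positions in Q
    q_mask[:, 7:] = True                           # last three positions of Q are pads
    attn_scores = attention(p, q, q_mask)          # batch_size * p_seq_len * q_seq_len
    # Aggregate Q for each element of P using the attention weights.
    matched_seq = attn_scores.bmm(q)               # batch_size * p_seq_len * dim
    return matched_seq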


class MultiplicativeAttention(Module):
    """
    Given sequence P and vector q, computes attention weights for each element
    in P by matching q with each element in P using multiplicative attention.
    * a_i = softmax(p_i . W . q)
    """

    class Config(Module.Config):
        p_hidden_dim: int = 32
        q_hidden_dim: int = 32
        normalize: bool = False

    @classmethod
    def from_config(cls, config: Config):
        return cls(config.p_hidden_dim, config.q_hidden_dim, config.normalize)

    def __init__(self, p_hidden_dim, q_hidden_dim, normalize):
        super().__init__()
        self.normalize = normalize
        self.linear = nn.Linear(q_hidden_dim, p_hidden_dim)
        log_class_usage(__class__)

    def forward(self, p_seq: torch.Tensor, q: torch.Tensor, p_mask: torch.Tensor):
        """
        Input:
            p_seq: batch_size * p_seq_len * p_hidden_dim
            q: batch_size * q_hidden_dim
            p_mask: batch_size * p_seq_len (1 for padding, 0 for true)
        Output:
            attn_scores: batch_size * p_seq_len
        """
        Wq = self.linear(q) if self.linear is not None else q
        pWq = p_seq.bmm(Wq.unsqueeze(2)).squeeze(2)
        pWq.data.masked_fill_(p_mask.data, -float("inf"))
        attn_scores = F.softmax(pWq, dim=-1) if self.normalize else pWq.exp()
        return attn_scores
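

# Usage sketch (illustrative, not part of the original module): p_hidden_dim
# and q_hidden_dim may differ; normalize=True yields softmax-normalized
# weights, otherwise unnormalized exponentiated scores are returned.
def _example_multiplicative_attention():
    attention = MultiplicativeAttention(p_hidden_dim=32, q_hidden_dim=16, normalize=True)
    p_seq = torch.randn(4, 15, 32)                  # batch_size=4, p_seq_len=15, p_hidden_dim=32
    q = torch.randn(4, 16)                          # batch_size=4, q_hidden_dim=16
    p_mask = torch.zeros(4, 15, dtype=torch.bool)   # True (1) marks padding positions in P
    p_mask[:, 12:] = True                           # last three positions of P are pads
    attn_scores = attention(p_seq, q, p_mask)       # batch_size * p_seq_len
    # Pool P with the attention weights.
    pooled = torch.bmm(attn_scores.unsqueeze(1), p_seq).squeeze(1)  # batch_size * p_hidden_dim
    return pooled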