#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import torch
import torch.nn as nn
from pytext.config.module_config import Activation, CNNParams, PoolingType
from pytext.models.representations.representation_base import RepresentationBase
from pytext.optimizer import get_activation
from pytext.utils.usage import log_class_usage


def pool(pooling_type, words):
# input dims: bsz * seq_len * num_filters
if pooling_type == PoolingType.MEAN:
return words.mean(dim=1)
elif pooling_type == PoolingType.MAX:
return words.max(dim=1)[0]
elif pooling_type == PoolingType.NONE:
return words
else:
        raise NotImplementedError(f"Pooling type {pooling_type} is not supported")
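

# Illustrative sketch, not part of the original module: a hypothetical helper
# showing how `pool` collapses the sequence dimension for MEAN and MAX and
# leaves the input untouched for NONE.
def _pool_shapes_example():
    words = torch.randn(2, 5, 8)  # bsz=2, seq_len=5, num_filters=8
    assert pool(PoolingType.MEAN, words).shape == (2, 8)
    assert pool(PoolingType.MAX, words).shape == (2, 8)
    assert pool(PoolingType.NONE, words).shape == (2, 5, 8)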


class Trim1d(nn.Module):
"""
Trims a 1d convolutional output. Used to implement history-padding
by removing excess padding from the right.
"""
def __init__(self, trim):
super(Trim1d, self).__init__()
self.trim = trim

    def forward(self, x):
return x[:, :, : -self.trim].contiguous()
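

# Hedged usage sketch (hypothetical helper, not in the original module): with
# the full causal padding of (k - 1) * d applied by nn.Conv1d on both sides,
# Trim1d removes the excess right padding so the output length equals the
# input length.
def _trim1d_example():
    k, d = 3, 2
    pad = (k - 1) * d  # 4
    conv = nn.Conv1d(8, 8, k, padding=pad, dilation=d)
    x = torch.randn(2, 8, 10)  # bsz * channels * seq_len
    y = Trim1d(pad)(conv(x))
    assert y.shape == x.shape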


class SeparableConv1d(nn.Module):
"""
Implements a 1d depthwise separable convolutional layer. In regular convolutional
layers, the input channels are mixed with each other to produce each output channel.
Depthwise separable convolutions decompose this process into two smaller
convolutions -- a depthwise and pointwise convolution.
The depthwise convolution spatially convolves each input channel separately,
then the pointwise convolution projects this result into a new channel space.
    This process reduces the number of FLOPs used to compute a convolution and also
exhibits a regularization effect. The general behavior -- including the input
parameters -- is equivalent to `nn.Conv1d`.
`bottleneck` controls the behavior of the pointwise convolution. Instead of
upsampling directly, we split the pointwise convolution into two pieces: the first
convolution downsamples into a (sufficiently small) low dimension and the
second convolution upsamples into the target (higher) dimension. Creating this
bottleneck significantly cuts the number of parameters with minimal loss
in performance.
"""

    def __init__(
self,
input_channels: int,
output_channels: int,
kernel_size: int,
padding: int,
dilation: int,
bottleneck: int,
):
super(SeparableConv1d, self).__init__()
conv_layers = [
nn.Conv1d(
input_channels,
input_channels,
kernel_size,
padding=padding,
dilation=dilation,
groups=input_channels,
)
]
if bottleneck > 0:
conv_layers.extend(
[
nn.Conv1d(input_channels, bottleneck, 1),
nn.Conv1d(bottleneck, output_channels, 1),
]
)
else:
conv_layers.append(nn.Conv1d(input_channels, output_channels, 1))
self.conv = nn.Sequential(*conv_layers)

    def forward(self, x):
return self.conv(x)
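

# Hedged illustration (hypothetical helper): the parameter savings of the
# separable and bottlenecked variants over a regular nn.Conv1d, bias included.
def _separable_param_count_example():
    def count(module):
        return sum(p.numel() for p in module.parameters())

    regular = nn.Conv1d(256, 256, 5, padding=2)
    separable = SeparableConv1d(256, 256, 5, padding=2, dilation=1, bottleneck=0)
    bottlenecked = SeparableConv1d(256, 256, 5, padding=2, dilation=1, bottleneck=64)
    # Regular: 256 * 256 * 5 + 256 ≈ 328k parameters.
    # Separable: (256 * 5 + 256) depthwise + (256 * 256 + 256) pointwise ≈ 67k.
    # Bottlenecked: depthwise + the 256 -> 64 -> 256 pointwise pair ≈ 35k.
    print(count(regular), count(separable), count(bottlenecked))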


def create_conv_package(
index: int,
activation: Activation,
in_channels: int,
out_channels: int,
kernel_size: int,
causal: bool,
dilated: bool,
separable: bool,
bottleneck: int,
weight_norm: bool,
):
"""
Creates a convolutional layer with the specified arguments.
Args:
index (int): Index of a convolutional layer in the stack.
activation (Activation): Activation function.
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
kernel_size (int): Size of 1d convolutional filter.
causal (bool): Whether the convolution is causal or not. If set, it
accounts for the temporal ordering of the inputs.
dilated (bool): Whether the convolution is dilated or not. If set,
the receptive field of the convolutional stack grows exponentially.
separable (bool): Whether to use depthwise separable convolutions
or not -- see `SeparableConv1d`.
bottleneck (int): Bottleneck channel dimension for depthwise separable
convolutions. See `SeparableConv1d` for an in-depth explanation.
weight_norm (bool): Whether to add weight normalization to the
regular convolutions or not.
"""
if not separable and bottleneck > 0:
raise RuntimeError(
"Bottleneck layers can only be used with separable convolutions"
)
if separable and weight_norm:
raise RuntimeError(
"Weight normalization is not supported for separable convolutions"
)

    def _compute_dilation(index, dilated):
"""
If set, the dilation factor increases by a factor of two for each
successive convolution to increase the receptive field exponentially.
"""
if dilated:
return 2 ** index
return 1

    def _compute_padding(kernel_size, dilation, causal):
"""
Non-causal convolutions are centered, so they will consume ((k - 1) // 2) * d
padding on both the left and the right of the sequence. Causal convolutions
are shifted to the left (to account for temporal ordering), so they will
only consume padding from the left. Therefore, we pad this side with the
full amount (k - 1) * d and remove the excess right-padding with `Trim1d`.
"""
if causal:
return (kernel_size - 1) * dilation
return ((kernel_size - 1) // 2) * dilation
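
    # Worked example (illustrative): with kernel_size=3 and dilation=4, a
    # causal layer left-pads by (3 - 1) * 4 = 8 and later trims 8 from the
    # right via Trim1d, while a non-causal layer pads ((3 - 1) // 2) * 4 = 4
    # on both sides. Either way the sequence length is preserved.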

    def _compute_out_channels(out_channels, activation):
"""
Gated Linear Unit (GLU) activations train two groups of convolutions,
then linearly combine their outputs through a gating mechanism. We
double the number of `out_channels` to mimic these two groups.
"""
if activation == Activation.GLU:
return out_channels * 2
return out_channels

    package = []
dilation = _compute_dilation(index, dilated)
padding = _compute_padding(kernel_size, dilation, causal)
out_channels = _compute_out_channels(out_channels, activation)
if separable:
package.append(
SeparableConv1d(
in_channels, out_channels, kernel_size, padding, dilation, bottleneck
)
)
else:
conv = nn.Conv1d(
in_channels, out_channels, kernel_size, padding=padding, dilation=dilation
)
if weight_norm:
conv = nn.utils.weight_norm(conv)
package.append(conv)
    if causal and padding > 0:
        # trim only when there is excess padding to remove (kernel_size > 1);
        # Trim1d(0) would slice the sequence down to nothing
        package.append(Trim1d(padding))
return package[0] if len(package) == 1 else nn.Sequential(*package)
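

# Hypothetical usage sketch of `create_conv_package` (names and sizes are
# arbitrary choices for illustration): a causal, dilated, separable package at
# index 2, so dilation = 2 ** 2 = 4 and the left padding is (3 - 1) * 4 = 8.
def _create_conv_package_example():
    package = create_conv_package(
        index=2,
        activation=Activation.RELU,
        in_channels=16,
        out_channels=16,
        kernel_size=3,
        causal=True,
        dilated=True,
        separable=True,
        bottleneck=0,
        weight_norm=False,
    )
    # The package is nn.Sequential(SeparableConv1d, Trim1d); length is kept.
    x = torch.randn(2, 16, 12)  # bsz * channels * seq_len
    assert package(x).shape == x.shape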


class DeepCNNRepresentation(RepresentationBase):
"""
    `DeepCNNRepresentation` implements a CNN representation layer preceded by
    a dropout layer. The CNN representation layer is based on the encoder in
    the architecture proposed by Gehring et al. in `Convolutional Sequence to
    Sequence Learning`.
Args:
config (Config): Configuration object of type DeepCNNRepresentation.Config.
embed_dim (int): The number of expected features in the input.
"""

    class Config(RepresentationBase.Config):
cnn: CNNParams = CNNParams()
dropout: float = 0.3
activation: Activation = Activation.GLU
separable: bool = False
bottleneck: int = 0
pooling_type: PoolingType = PoolingType.NONE

    def __init__(self, config: Config, embed_dim: int) -> None:
super().__init__(config)
out_channels = config.cnn.kernel_num
kernel_sizes = config.cnn.kernel_sizes
weight_norm = config.cnn.weight_norm
dilated = config.cnn.dilated
causal = config.cnn.causal
activation = config.activation
pooling_type = config.pooling_type
separable = config.separable
bottleneck = config.bottleneck
conv_layers = {}
linear_layers = {}
in_channels = embed_dim
for i, k in enumerate(kernel_sizes):
            # kernel sizes must be odd so the non-causal padding of
            # ((k - 1) // 2) * d exactly preserves the sequence length
            assert (k - 1) % 2 == 0
if in_channels != out_channels:
linear_layers[str(i)] = nn.Linear(in_channels, out_channels)
single_conv = create_conv_package(
index=i,
activation=activation,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=k,
causal=causal,
dilated=dilated,
separable=separable,
bottleneck=bottleneck,
weight_norm=weight_norm,
)
conv_layers[str(i)] = single_conv
in_channels = out_channels
self.convs = nn.ModuleDict(conv_layers)
self.projections = nn.ModuleDict(linear_layers)
self.activation = get_activation(activation)
self.pooling_type = pooling_type
self.representation_dim = out_channels
self.dropout = nn.Dropout(p=config.dropout)
log_class_usage(__class__)

    def forward(self, inputs: torch.Tensor, *args) -> torch.Tensor:
inputs = self.dropout(inputs)
# bsz * seq_len * embed_dim -> bsz * embed_dim * seq_len
words = inputs.permute(0, 2, 1)
convs_keys = self.convs.keys()
projections_keys = self.projections.keys()
# Extra verbosity is due to jit.script.
for k in convs_keys:
conv = self.convs[k]
if k not in projections_keys:
residual = words
else:
proj = self.projections[k]
                transposed = words.permute(0, 2, 1)
                residual = proj(transposed).permute(0, 2, 1)
words = conv(words)
words = self.activation(words)
            # scale the residual sum by sqrt(0.5) to keep its variance roughly
            # constant, following Gehring et al.
            words = (words + residual) * math.sqrt(0.5)
words = words.permute(0, 2, 1)
return pool(self.pooling_type, words)
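

# Minimal end-to-end sketch (hypothetical, not part of the module). It assumes
# pytext's ConfigBase keyword construction; embed_dim=32 and the odd kernel
# sizes are arbitrary choices that satisfy the assert in __init__.
def _deep_cnn_example():
    config = DeepCNNRepresentation.Config(
        cnn=CNNParams(kernel_num=64, kernel_sizes=[3, 5])
    )
    model = DeepCNNRepresentation(config, embed_dim=32)
    inputs = torch.randn(4, 20, 32)  # bsz * seq_len * embed_dim
    outputs = model(inputs)
    # With the default PoolingType.NONE, the sequence dimension is kept:
    # bsz * seq_len * representation_dim.
    assert outputs.shape == (4, 20, model.representation_dim)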