#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import List, Optional, Tuple
import torch
from pytext.torchscript.utils import (
pad_2d,
pad_2d_float,
pad_3d_float,
validate_padding_control,
)
from pytext.torchscript.vocab import ScriptVocabulary
class ScriptTensorizer(torch.jit.ScriptModule):
    """
    Abstract base class for TorchScript-compatible tensorizers.

    Subclasses implement tokenize(), numberize() and tensorize() to turn raw
    text rows (or pre-tokenized rows) into model input tensors. This base
    class only provides device bookkeeping, padding-control storage, and
    small shared helpers over the two optional input formats.
    """

    # Target device name ("" until set_device() is called).
    device: str
    # Optional padding controls; None means "no padding control configured".
    seq_padding_control: Optional[List[int]]
    batch_padding_control: Optional[List[int]]

    def __init__(self):
        super().__init__()
        self.device = ""
        # torch.jit.Attribute keeps the Optional[List[int]] type visible to
        # the TorchScript compiler even though the initial value is None.
        self.seq_padding_control = torch.jit.Attribute(None, Optional[List[int]])
        self.batch_padding_control = torch.jit.Attribute(None, Optional[List[int]])

    @torch.jit.script_method
    def set_device(self, device: str):
        """Record the device string used when building tensors."""
        self.device = device

    @torch.jit.export
    def set_padding_control(self, dimension: str, padding_control: Optional[List[int]]):
        """
        This functions will be called to set a padding style.
        None - No padding
        List: first element 0, round seq length to the smallest list element larger than inputs

        Raises:
            RuntimeError: if padding_control is malformed or dimension is
                neither "sequence_length" nor "batch_length".
        """
        if not validate_padding_control(padding_control):
            raise RuntimeError("Malformed padding_control value")
        if dimension == "sequence_length":
            self.seq_padding_control = padding_control
        elif dimension == "batch_length":
            self.batch_padding_control = padding_control
        else:
            raise RuntimeError("Illegal padding dimension specified.")

    @torch.jit.script_method
    def tokenize(
        self, text_row: Optional[List[str]], token_row: Optional[List[List[str]]]
    ):
        """
        Process a single line of raw inputs into tokens, it supports
        two input formats:
            1) a single line of texts (single sentence or a pair)
            2) a single line of pre-processed tokens (single sentence or a pair)
        """
        raise NotImplementedError

    @torch.jit.script_method
    def numberize(
        self, text_row: Optional[List[str]], token_row: Optional[List[List[str]]]
    ):
        """
        Process a single line of raw inputs into numberized result, it supports
        two input formats:
            1) a single line of texts (single sentence or a pair)
            2) a single line of pre-processed tokens (single sentence or a pair)

        This function should handle the logic of calling tokenize(), add special
        tokens and vocab lookup.
        """
        raise NotImplementedError

    @torch.jit.script_method
    def tensorize(
        self,
        texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[List[str]]]] = None,
    ):
        """
        Process raw inputs into model input tensors, it supports two input
        formats:
            1) multiple rows of texts (single sentence or a pair)
            2) multiple rows of pre-processed tokens (single sentence or a pair)

        This function should handle the logic of calling numberize() and also
        padding the numberized result.
        """
        raise NotImplementedError

    @torch.jit.script_method
    def batch_size(
        self, texts: Optional[List[List[str]]], tokens: Optional[List[List[List[str]]]]
    ) -> int:
        """Return the number of rows in whichever input is provided."""
        if texts is not None:
            return len(texts)
        elif tokens is not None:
            return len(tokens)
        else:
            raise RuntimeError("Empty input for both texts and tokens.")

    @torch.jit.script_method
    def row_size(
        self,
        texts_list: Optional[List[List[str]]] = None,
        tokens_list: Optional[List[List[List[str]]]] = None,
    ) -> int:
        """Return the width of the first row (e.g. 1 for a single sentence,
        2 for a sentence pair), taken from whichever input is provided."""
        if texts_list is not None:
            return len(texts_list[0])
        elif tokens_list is not None:
            return len(tokens_list[0])
        else:
            raise RuntimeError("Empty input for both texts and tokens.")

    @torch.jit.script_method
    def get_texts_by_index(
        self, texts: Optional[List[List[str]]], index: int
    ) -> Optional[List[str]]:
        """Return row `index` of texts, or None when texts is absent."""
        if texts is None:
            return None
        return texts[index]

    @torch.jit.script_method
    def get_tokens_by_index(
        self, tokens: Optional[List[List[List[str]]]], index: int
    ) -> Optional[List[List[str]]]:
        """Return row `index` of tokens, or None when tokens is absent."""
        if tokens is None:
            return None
        return tokens[index]
class VocabLookup(torch.jit.ScriptModule):
    """
    TorchScript implementation of lookup_tokens() in pytext/data/tensorizers.py
    """

    def __init__(self, vocab: ScriptVocabulary):
        super().__init__()
        self.vocab = vocab

    @torch.jit.script_method
    def forward(
        self,
        tokens: List[Tuple[str, int, int]],
        bos_idx: Optional[int] = None,
        eos_idx: Optional[int] = None,
        use_eos_token_for_bos: bool = False,
        max_seq_len: int = 2 ** 30,
    ) -> Tuple[List[int], List[int], List[int]]:
        """Convert tokens into ids by doing vocab look-up.

        Convert tokens into ids by doing vocab look-up. It will also append
        bos & eos index into token_ids if needed. A token is represented by
        a Tuple[str, int, int], which is [token, start_index, end_index].

        Args:
            tokens: List of tokens with start and end position in the original
                text. start and end index could be optional (e.g value is -1)
            bos_idx: index of begin of sentence, optional.
            eos_idx: index of end of sentence, optional.
            use_eos_token_for_bos: use eos index as bos.
            max_seq_len: maximum tokens length.

        Returns:
            Tuple of (token_ids, start_idxs, end_idxs); special tokens get
            -1 for both start and end position.
        """
        # unwrap Optional typing: -1 is the sentinel for "no special token"
        if bos_idx is None:
            bos_idx = -1
        if eos_idx is None:
            eos_idx = -1

        text_tokens: List[str] = []
        start_idxs: List[int] = []
        end_idxs: List[int] = []
        # Reserve room in the budget for bos/eos so the final sequence
        # (including special tokens) never exceeds the original max_seq_len.
        max_seq_len = (
            max_seq_len - (1 if bos_idx >= 0 else 0) - (1 if eos_idx >= 0 else 0)
        )
        for i in range(min(len(tokens), max_seq_len)):
            token: Tuple[str, int, int] = tokens[i]
            text_tokens.append(token[0])
            start_idxs.append(token[1])
            end_idxs.append(token[2])

        # vocab lookup
        token_ids: List[int] = self.vocab.lookup_indices_1d(text_tokens)

        # add bos and eos index if needed
        if bos_idx >= 0:
            if use_eos_token_for_bos:
                bos_idx = eos_idx
            token_ids = [bos_idx] + token_ids
            start_idxs = [-1] + start_idxs
            end_idxs = [-1] + end_idxs
        if eos_idx >= 0:
            token_ids.append(eos_idx)
            start_idxs.append(-1)
            end_idxs.append(-1)
        return token_ids, start_idxs, end_idxs
class ScriptInteger1DListTensorizer(torch.jit.ScriptModule):
    """
    TorchScript implementation of Integer1DListTensorizer in pytext/data/tensorizers.py
    """

    def __init__(self):
        super().__init__()
        # Index used to pad shorter rows in tensorize().
        self.pad_idx = 0

    @torch.jit.script_method
    def numberize(self, integerList: List[int]) -> Tuple[List[int], int]:
        """Return the row unchanged along with its length (no vocab lookup needed)."""
        return integerList, len(integerList)

    @torch.jit.script_method
    def tensorize(
        self, integerList: List[List[int]], seq_lens: List[int]
    ) -> torch.Tensor:
        """Pad rows to a common length with pad_idx and stack into a long tensor."""
        integerListTensor = torch.tensor(
            pad_2d(integerList, seq_lens=seq_lens, pad_idx=self.pad_idx),
            dtype=torch.long,
        )
        return integerListTensor

    @torch.jit.ignore
    def torchscriptify(self):
        """Compile this module to TorchScript (excluded from scripting itself)."""
        return torch.jit.script(self)
class ScriptFloat1DListTensorizer(torch.jit.ScriptModule):
    """
    TorchScript implementation of Float1DListTensorizer in pytext/data/tensorizers.py
    """

    def __init__(self):
        super().__init__()
        # Value used to pad shorter rows in tensorize().
        # NOTE(review): 1.0 (not 0.0) mirrors the eager Float1DListTensorizer;
        # confirm against pytext/data/tensorizers.py before changing.
        self.pad_val = 1.0

    @torch.jit.script_method
    def numberize(self, floatList: List[float]) -> Tuple[List[float], int]:
        """Return the row unchanged along with its length (no vocab lookup needed)."""
        return floatList, len(floatList)

    @torch.jit.script_method
    def tensorize(
        self, floatLists: List[List[float]], seq_lens: List[int]
    ) -> torch.Tensor:
        """Pad rows to a common length with pad_val and stack into a float tensor."""
        floatListTensor = torch.tensor(
            pad_2d_float(floatLists, seq_lens=seq_lens, pad_val=self.pad_val),
            dtype=torch.float,
        )
        return floatListTensor

    def torchscriptify(self):
        """Compile this module to TorchScript.

        NOTE(review): unlike ScriptInteger1DListTensorizer.torchscriptify,
        this method is not marked @torch.jit.ignore — left as-is to preserve
        existing scripting behavior; confirm whether the decorator is needed.
        """
        return torch.jit.script(self)
class ScriptFloatListSeqTensorizer(torch.jit.ScriptModule):
    """
    TorchScript implementation of ScriptFloatListSeqTensorizer in pytext/data/tensorizers.py
    """

    def __init__(self, pad_token):
        super().__init__()
        # Value used to pad shorter sequences in tensorize().
        self.pad_val = pad_token

    @torch.jit.script_method
    def numberize(self, floatList: List[List[float]]) -> Tuple[List[List[float]], int]:
        """Return the row unchanged along with its sequence length."""
        return (floatList, len(floatList))

    @torch.jit.script_method
    def tensorize(
        self, floatLists: List[List[List[float]]], seq_lens: List[int]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Pad the 3-d float lists with pad_val and return (values, lengths) tensors."""
        floatListTensor = torch.tensor(
            pad_3d_float(floatLists, seq_lens=seq_lens, pad_val=self.pad_val),
            dtype=torch.float,
        )
        seqLensTensor = torch.tensor(seq_lens, dtype=torch.long)
        return (floatListTensor, seqLensTensor)

    def torchscriptify(self):
        """Compile this module to TorchScript."""
        return torch.jit.script(self)