Source code for pytext.fields.dict_field

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from collections import Counter
from typing import List, Tuple

import torch
from pytext.common.constants import VocabMeta
from pytext.utils.data import no_tokenize
from torchtext import vocab

try:
    from torchtext.legacy import data as textdata
except ImportError:
    from torchtext import data as textdata

from .field import VocabUsingField


class DictFeatureField(VocabUsingField):
    dummy_model_input = (
        torch.tensor([[1], [1]], dtype=torch.long, device="cpu"),
        torch.tensor([[1.5], [2.5]], dtype=torch.float, device="cpu"),
        torch.tensor([[1], [1]], dtype=torch.long, device="cpu"),
    )

    def __init__(
        self,
        pad_token=VocabMeta.PAD_TOKEN,
        unk_token=VocabMeta.UNK_TOKEN,
        batch_first=True,
        left_pad=False,
        **kwargs,
    ):
        super().__init__(
            sequential=True,
            batch_first=batch_first,
            pad_first=left_pad,
            tokenize=no_tokenize,
            use_vocab=True,
            pad_token=pad_token,
            unk_token=unk_token,
        )

    def build_vocab(self, *args, **kwargs):
        sources = []
        for arg in args:
            if isinstance(arg, textdata.Dataset):
                sources += [
                    getattr(arg, name)
                    for name, field in arg.fields.items()
                    if field is self
                ]
            else:
                sources.append(arg)
        counter = Counter()
        for data in sources:
            for x in data:
                if len(x) > 0:
                    counter.update(x[0])
        specials = [self.unk_token, self.pad_token]
        self.vocab = vocab.Vocab(counter, specials=specials, **kwargs)

    def pad(
        self, minibatch: List[Tuple[List[int], List[float], List[int]]]
    ) -> Tuple[List[List[int]], List[List[float]], List[int]]:
        # Pad a minibatch of dictionary features to be
        # batch_size * max_number_of_words * max_number_of_features
        # unpack the minibatch
        feats, weights, lengths = [], [], []
        for (fs, ws, ls) in minibatch:
            feats.append(fs)
            weights.append(ws)
            lengths.append(ls)

        lengths_flattened = [l for l_list in lengths for l in l_list]
        seq_lens = [len(l_list) for l_list in lengths]
        max_ex_len = self.pad_length(max(seq_lens))
        max_feat_len = max(lengths_flattened)
        all_lengths, all_feats, all_weights = [], [], []
        for i, seq_len in enumerate(seq_lens):
            ex_feats, ex_weights, ex_lengths = [], [], []
            feats_lengths, feats_vals, feats_weights = lengths[i], feats[i], weights[i]
            max_feat_len_example = max(feats_lengths)
            r_offset = 0
            for _ in feats_lengths:
                # The dict feats obtained from the featurizer will have necessary
                # padding at the utterance level. Therefore we move the offset by
                # max feature length in the example.
                ex_feats.extend(feats_vals[r_offset : r_offset + max_feat_len_example])
                ex_feats.extend(
                    [self.pad_token] * (max_feat_len - max_feat_len_example)
                )
                ex_weights.extend(
                    feats_weights[r_offset : r_offset + max_feat_len_example]
                )
                ex_weights.extend([0.0] * (max_feat_len - max_feat_len_example))
                r_offset += max_feat_len_example
            ex_lengths.extend(feats_lengths)

            # Pad examples
            ex_padding = (max_ex_len - seq_len) * max_feat_len
            if self.pad_first:  # left padding
                ex_feats = [self.pad_token] * ex_padding + ex_feats
                ex_weights = [0.0] * ex_padding + ex_weights
                ex_lengths = [1] * (max_ex_len - seq_len) + ex_lengths
            else:  # right padding
                ex_feats.extend([self.pad_token] * ex_padding)
                ex_weights.extend([0.0] * ex_padding)
                ex_lengths.extend([1] * (max_ex_len - seq_len))

            all_feats.append(ex_feats)
            all_weights.append(ex_weights)
            all_lengths.append(ex_lengths)
        return all_feats, all_weights, all_lengths

    def numericalize(self, arr, device=None):
        feats, weights, lengths = arr
        weights = torch.tensor(weights, dtype=torch.float, device=device)
        lengths = torch.tensor(lengths, dtype=torch.long, device=device)
        feats = [[self.vocab.stoi[x] for x in ex] for ex in feats]
        feats = torch.tensor(feats, dtype=self.dtype, device=device)
        if not self.batch_first:
            # transpose to time-major layout when batch_first is False
            feats.t_()
            weights.t_()
        feats = feats.contiguous()
        return feats, weights, lengths
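
# A minimal usage sketch of DictFeatureField, assuming a torchtext installation
# compatible with the legacy Vocab/data APIs imported above and that the default
# constructor arguments suffice. The feature tokens and weights below are
# made-up illustration values; each example is the
# (feature tokens, weights, per-word feature counts) triple that build_vocab,
# pad, and numericalize expect.
if __name__ == "__main__":
    field = DictFeatureField()

    # Example 1: two words; word 1 has one dict feature, word 2 has two.
    # The featurizer is assumed to have already padded each word's feature
    # slots to this example's max feature count (2), hence field.pad_token
    # in the flat token list.
    ex1 = (
        ["place", field.pad_token, "city", "state"],  # flat feature tokens
        [1.0, 0.0, 0.5, 0.5],  # matching weights
        [1, 2],  # number of features per word
    )
    # Example 2: a single word with a single dict feature.
    ex2 = (["place"], [1.0], [1])

    field.build_vocab([ex1, ex2])

    # pad() aligns both examples to 2 words x 2 feature slots per word,
    # filling with field.pad_token and 0.0 weights where needed.
    feats, weights, lengths = field.pad([ex1, ex2])

    # numericalize() maps tokens to vocab ids and returns tensors:
    #   feats:   LongTensor  of shape (2, 4)
    #   weights: FloatTensor of shape (2, 4)
    #   lengths: LongTensor  of shape (2, 2)
    feats_t, weights_t, lengths_t = field.numericalize((feats, weights, lengths))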