Source code for pytext.data.disjoint_multitask_data_handler

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import itertools
from collections import OrderedDict
from typing import Dict, Optional, Tuple

import numpy as np
from pytext.common.constants import BatchContext

from .data_handler import BatchIterator, DataHandler


class RoundRobinBatchIterator(BatchIterator):
    """
    We take a dictionary of BatchIterators and do round robin over them in a cycle.
    The below describes the behavior for one epoch, with the example

    Iterator 1: [A, B, C, D],  Iterator 2: [a, b]

    If `upsample` is True:
        If `iter_to_set_epoch` is set, cycle batches from each iterator until one epoch
        of the target iterator is fulfilled. Iterators with fewer batches than the
        target iterator are repeated, so they never run out.

        iter_to_set_epoch = "Iterator 1"
        Output: [A, a, B, b, C, a, D, b]

        If `iter_to_set_epoch` is None, cycle over batches from each iterator until the
        shortest iterator completes one epoch.

        Output: [A, a, B, b]

        An iterator that produces no batches at all cannot be repeated; its slot
        in each round is skipped instead of blocking the epoch.

    If `upsample` is False:
        Iterate over batches from one epoch of each iterator, with the order among
        iterators uniformly shuffled.

        Possible output: [a, A, B, C, b, D]

    Every yielded batch is an `(input, target, context)` triple whose `context`
    has been tagged in place with `BatchContext.TASK_NAME` set to the name of the
    iterator the batch came from.

    Args:
        iterators (Dict[str, BatchIterator]): Iterators to do roundrobin over.
        upsample (bool): If upsample, keep cycling over each iterator in round-robin.
          Iterators with less batches will get more passes.  If False, we do single
          pass over each iterator, in random order. Evaluation will use upsample=False.
          Default True.
        iter_to_set_epoch (Optional[str]): Name of iterator to define epoch size.
          If upsample is True and this is not set, epoch size defaults to the length of
          the shortest iterator. If upsample is False, this argument is not used.
    Attributes:
        iterators (Dict[str, BatchIterator]): Iterators to do roundrobin over.
        upsample (bool): Whether to upsample iterators with fewer batches.
        iter_to_set_epoch (Optional[str]): Name of iterator to define epoch size.
    """

    def __init__(
        self,
        iterators: Dict[str, BatchIterator],
        upsample: bool = True,
        iter_to_set_epoch: Optional[str] = None,
    ) -> None:
        self.iterators = iterators
        self.upsample = upsample
        self.iter_to_set_epoch = iter_to_set_epoch

    def __iter__(self):
        """Return a fresh one-epoch iterator in the mode selected by `upsample`."""
        return iter(self._upsample_iter() if self.upsample else self._shuffle_iter())

    def _upsample_iter(self):
        """Yield one epoch of task-tagged batches in strict round-robin order.

        The epoch consists of `batch_per_iter` rounds, where `batch_per_iter` is
        the length of the `iter_to_set_epoch` iterator when that name is set, and
        the length of the shortest iterator otherwise. Each round takes one batch
        from every iterator, restarting iterators that have run out.

        Raises:
            KeyError: If `iter_to_set_epoch` is set but is not a key of
                `iterators`.
        """
        if self.iter_to_set_epoch:
            batch_per_iter = len(self.iterators[self.iter_to_set_epoch])
        else:
            # default=0 turns "no iterators at all" into an empty epoch rather
            # than a ValueError from min() on an empty sequence.
            batch_per_iter = min(
                (len(iterator) for iterator in self.iterators.values()), default=0
            )
        total_batches = len(self.iterators) * batch_per_iter

        # One endless stream of (name, batch) pairs per task.
        tagged_streams = [
            zip(itertools.repeat(name), self.cycle(iterator))
            for name, iterator in self.iterators.items()
        ]

        # zip_longest takes one pair from every stream per round; chaining the
        # rounds flattens them into a single round-robin sequence.
        round_robin = itertools.chain.from_iterable(
            itertools.zip_longest(*tagged_streams)
        )

        # islice stops after exactly `total_batches` slots. Checking a counter
        # inside the loop instead would first pull (and throw away) one more
        # batch from every iterator before noticing the epoch is over.
        for slot in itertools.islice(round_robin, total_batches):
            if slot is None:
                # zip_longest fill value: this stream ended because its
                # iterator produced nothing on a pass (see `cycle`).
                continue
            name, (inputs, target, context) = slot
            context[BatchContext.TASK_NAME] = name
            yield inputs, target, context

    def _shuffle_iter(self):
        """Yield every batch of every iterator once, in shuffled task order.

        The order is drawn by shuffling, with `np.random.shuffle`, a list that
        contains each iterator's index once per batch it reports via `len`.
        Batches of a single iterator keep their relative order.
        """
        indices = []
        iterators = []
        for i, (name, it) in enumerate(self.iterators.items()):
            # one entry per batch, so longer iterators are drawn proportionally
            indices.extend([i] * len(it))
            iterators.append((name, iter(it)))

        indices = np.array(indices)
        np.random.shuffle(indices)

        for i in indices:
            name, iterator = iterators[i]
            inputs, target, context = next(iterator)
            context[BatchContext.TASK_NAME] = name
            yield inputs, target, context

    @classmethod
    def cycle(cls, iterator):
        """Yield the items of `iterator` over and over by re-iterating it.

        The iterable is restarted with a new `for` loop after each complete
        pass. If a pass yields no item at all (an empty iterable, or a
        single-use one that is already exhausted) the generator stops, because
        looping again could never produce anything and would spin forever.

        Args:
            iterator: Any iterable; it must be re-iterable to be repeated.

        Yields:
            The items of `iterator`, pass after pass.
        """
        while True:
            produced_any = False
            for item in iterator:
                produced_any = True
                yield item
            if not produced_any:
                return


class DisjointMultitaskDataHandler(DataHandler):
    """
    Wrapper for doing multitask training using multiple data handlers.
    Takes a dictionary of data handlers, does round robin over their
    iterators using RoundRobinBatchIterator.

    Args:
        config (Config): Configuration object of type DisjointMultitaskDataHandler.Config.
        data_handlers (Dict[str, DataHandler]): Data handlers to do roundrobin over.
        target_task_name (Optional[str]): Name of the task whose train iterator
          defines the epoch size; passed to RoundRobinBatchIterator as
          `iter_to_set_epoch`.
        *args: Accepted for call compatibility; not used by this constructor.
        **kwargs: Accepted for call compatibility; not used by this constructor.

    Attributes:
        data_handlers (Dict[str, DataHandler]): Data handlers to do roundrobin over.
        target_task_name (Optional[str]): Name of the task that defines the
          training epoch size.
        upsample (bool): Copied from `config.upsample`; only affects the train
          iterator. If True, keep cycling over each iterator in round-robin, so
          iterators with less batches get more passes.  If False, do a single
          pass over each iterator in random order.  Eval and test iterators
          always use upsample=False.
        metadata (Dict): Per-task metadata keyed by task name; set by
          `init_metadata` or `load_metadata`.

    """

    class Config(DataHandler.Config):
        """Configuration class for `DisjointMultitaskDataHandler`.

        Attributes:
            upsample (bool): If upsample, keep cycling over each train iterator in
              round-robin.  Iterators with less batches will get more passes.  If
              False, we do single pass over each iterator, in random order.
              Default True.

        """

        upsample: bool = True

    def __init__(
        self,
        config: Config,
        data_handlers: Dict[str, DataHandler],
        target_task_name: Optional[str] = None,
        *args,
        **kwargs
    ) -> None:
        # The three remaining positional parameters of DataHandler.__init__ are
        # passed as None; the wrapped handlers carry their own state.
        # NOTE(review): meaning of those parameters is not visible here.
        super().__init__(config, None, None, None)
        self.data_handlers = data_handlers
        self.upsample = config.upsample
        self.target_task_name = target_task_name

    def get_train_iter(self, rank: int = 0, world_size: int = 1) -> BatchIterator:
        """Build the multitask training iterator.

        Args:
            rank (int): Forwarded unchanged to every sub handler's
              `get_train_iter`.
            world_size (int): Forwarded unchanged to every sub handler's
              `get_train_iter`.

        Returns:
            BatchIterator: A single RoundRobinBatchIterator over the train
            iterators of all sub handlers, using this handler's `upsample`
            setting and `target_task_name` as the epoch-defining iterator.
        """
        iterators: Dict = OrderedDict(
            (name, data_handler.get_train_iter(rank, world_size))
            for name, data_handler in self.data_handlers.items()
        )
        return RoundRobinBatchIterator(
            iterators, upsample=self.upsample, iter_to_set_epoch=self.target_task_name
        )

    def get_eval_iter(self) -> BatchIterator:
        """Return a RoundRobinBatchIterator over all sub handlers' eval iterators.

        Upsampling is always disabled here, so each eval batch is visited once.
        """
        iterators: Dict = OrderedDict(
            (name, data_handler.get_eval_iter())
            for name, data_handler in self.data_handlers.items()
        )
        return RoundRobinBatchIterator(iterators, upsample=False)

    def get_test_iter(self) -> BatchIterator:
        """Return a RoundRobinBatchIterator over all sub handlers' test iterators.

        Upsampling is always disabled here, so each test batch is visited once.
        """
        iterators: Dict = OrderedDict(
            (name, data_handler.get_test_iter())
            for name, data_handler in self.data_handlers.items()
        )
        return RoundRobinBatchIterator(iterators, upsample=False)

    def init_metadata(self):
        """Initialize every sub handler's metadata and collect it by task name."""
        self.metadata = {}
        for name, data_handler in self.data_handlers.items():
            data_handler.init_metadata()
            self.metadata[name] = data_handler.metadata

    def load_metadata(self, metadata):
        """Store `metadata` and hand each sub handler its own entry.

        Args:
            metadata: Mapping indexed by task name; `metadata[name]` is passed
              to the `load_metadata` of the sub handler registered as `name`.
        """
        self.metadata = metadata
        for name, data_handler in self.data_handlers.items():
            data_handler.load_metadata(metadata[name])

    def metadata_to_save(self):
        """Return a dict mapping task name to that sub handler's `metadata_to_save()`."""
        return {
            name: data_handler.metadata_to_save()
            for name, data_handler in self.data_handlers.items()
        }