Source code for pytext.data.data

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import functools
import itertools
import math
import random
from typing import Any, Dict, Iterable, List, MutableMapping, NamedTuple, Optional, Type

from pytext.common.constants import RawExampleFieldName, Stage
from pytext.config.component import Component, ComponentType, Registry, create_component
from pytext.utils.usage import log_class_usage

from .sources import DataSource, RawExample, TSVDataSource
from .sources.data_source import (
    GeneratorIterator,
    RowShardedDataSource,
    ShardedDataSource,
)
from .tensorizers import MetricTensorizer, Tensorizer, initialize_tensorizers


class RowData(NamedTuple):
    raw_data: RawExample
    numberized: RawExample


class BatchData(NamedTuple):
    raw_data: List[RawExample]
    numberized: Dict[str, List[Any]]


class Batcher(Component):
    """Batcher designed to batch rows of data, before padding."""

    __COMPONENT_TYPE__ = ComponentType.BATCHER
    __EXPANSIBLE__ = True

    class Config(Component.Config):
        #: Make batches of this size when possible. If there's not enough data,
        #: might generate some smaller batches.
        train_batch_size: int = 16
        eval_batch_size: int = 16
        test_batch_size: int = 16

    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.train_batch_size, config.eval_batch_size, config.test_batch_size
        )

    def __init__(
        self,
        train_batch_size=Config.train_batch_size,
        eval_batch_size=Config.eval_batch_size,
        test_batch_size=Config.test_batch_size,
    ):
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.test_batch_size = test_batch_size
        self._batch_sizes = {
            Stage.TRAIN: self.train_batch_size,
            Stage.TEST: self.test_batch_size,
            Stage.EVAL: self.eval_batch_size,
        }

    def batchify(
        self, iterable: Iterable[RawExample], sort_key=None, stage=Stage.TRAIN
    ):
        """Group rows by batch_size. Assume an iterable of dicts, yield dicts of
        lists. If len(iterable) is not evenly divisible by batch_size, the last
        batch will be smaller, of length len(iterable) % batch_size."""
        batch_size = self._batch_sizes[stage]
        for batch in self._group_iter(iterable, batch_size, sort_key):
            raw_batch, numberized_batch = zip(*batch)
            yield BatchData(raw_batch, zip_dicts(numberized_batch))

    def _group_iter(self, iterable: Iterable[RawExample], group_size, sort_key=None):
        iterators = [iter(iterable)] * group_size
        for group in itertools.zip_longest(*iterators):
            group = [ex for ex in group if ex is not None]
            if sort_key:
                group.sort(key=sort_key, reverse=True)
            yield group
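
# Illustrative sketch (not part of the original module; the rows below are
# hypothetical). Batcher.batchify consumes an iterable of (raw, numberized)
# pairs and yields BatchData tuples whose `numberized` field is a dict of lists:
#
#   batcher = Batcher(train_batch_size=2)
#   rows = [RowData({"text": t}, {"tokens": [i]}) for i, t in enumerate("abcd")]
#   for raw_batch, numberized in batcher.batchify(rows, stage=Stage.TRAIN):
#       # first batch: raw_batch == ({"text": "a"}, {"text": "b"}),
#       #              numberized == {"tokens": [[0], [1]]}
#       pass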


class PoolingBatcher(Batcher):
    """
    Batcher that shuffles and (if requested) sorts data.

    **Rationale**

    There is a trade-off between having batches of data that are truly randomly
    shuffled, and batches of data that are efficiently padded. If we wanted to
    maximise the efficiency of padding (i.e. minimise the amount of padding that
    is needed), we would have to enforce that all inputs of a similar length
    appear in the same batch. This however would lead to a dramatic decrease in
    the randomness of batches. On the other end of the spectrum, if we wanted to
    maximise randomness, we would often end up with inputs of wildly different
    lengths in the same batch, which would lead to a lot of padding.

    **Operation**

    This batcher uses a multi-staged approach:

    1. It first loads a number of "pools" of data, and shuffles them (this is
       controlled by `num_shuffled_pools`).
    2. It then splits up the shuffled data sequentially into individual pools,
       and the examples within each pool are sorted (if requested).
    3. Finally, each pool is split up sequentially into batches, and yielded.
       If sorting was requested in step #2, the order in which the batches are
       yielded is randomised.

    The size of a pool is expressed as a multiple of the batch size, and is
    controlled by `pool_num_batches`.

    **Examples**

    Assuming sorting is enabled, with the default settings of
    `pool_num_batches: 1000` and `num_shuffled_pools: 1`, a pool of
    `1k * batch_size` examples is loaded, sorted by length, and split up into
    1k batches. These batches are then yielded in random order. Once they run
    out, a new pool is loaded, and the process is repeated. An advantage of this
    approach is that padding will be somewhat reduced. A disadvantage is that,
    for every epoch, the first 1k batches will always be the same (albeit in a
    different order).

    On the other hand, specifying `pool_num_batches: 1000` and
    `num_shuffled_pools: 1000` would achieve the following: `1k * 1k *
    batch_size` examples are loaded and shuffled. These are then split up into
    pools of size `1k * batch_size`, which are then sorted internally, split
    into individual batches, and yielded in random order. Compared to the
    previous example, we no longer have the problem that the first 1k batches
    are always the same in each epoch, but we have had to load 1M examples into
    memory.
    """

    class Config(Batcher.Config):
        #: Size of a pool expressed in number of batches
        pool_num_batches: int = 1000
        #: How many pool-sized chunks to load at a time for shuffling
        num_shuffled_pools: int = 1

    @classmethod
    def from_config(cls, config: Config):
        return cls(
            config.train_batch_size,
            config.eval_batch_size,
            config.test_batch_size,
            config.pool_num_batches,
            config.num_shuffled_pools,
        )

    def __init__(
        self,
        train_batch_size=Config.train_batch_size,
        eval_batch_size=Config.eval_batch_size,
        test_batch_size=Config.test_batch_size,
        pool_num_batches=Config.pool_num_batches,
        num_shuffled_pools=Config.num_shuffled_pools,
    ):
        super().__init__(train_batch_size, eval_batch_size, test_batch_size)
        assert pool_num_batches >= 1 and num_shuffled_pools >= 1
        self.pool_num_batches = pool_num_batches
        self.num_shuffled_pools = num_shuffled_pools

    def get_batch_size(self, stage: Stage) -> int:
        return self._batch_sizes[stage]

    def batchify(
        self, iterable: Iterable[RawExample], sort_key=None, stage=Stage.TRAIN
    ):
        """
        From an iterable of dicts, yield dicts of lists:

        1. Load `num_shuffled_pools` pools of data, and shuffle them.
        2. Load a pool (`batch_size * pool_num_batches` examples).
        3. Sort rows, if necessary.
        4. Shuffle the order in which the batches are returned, if necessary.
        """
        batch_size = self.get_batch_size(stage)
        pool_size = batch_size * self.pool_num_batches
        super_pool_size = pool_size * self.num_shuffled_pools

        for super_pool in self._group_iter(iterable, super_pool_size, None):
            # No point in shuffling if we're loading a single pool which is then sorted.
            if self.num_shuffled_pools > 1 or sort_key is None:
                random.shuffle(super_pool)
            for pool in self._group_iter(super_pool, pool_size, sort_key):
                batch_indices = list(range(math.ceil(len(pool) / batch_size)))
                if sort_key:
                    random.shuffle(batch_indices)
                for batch_index in batch_indices:
                    batch = pool[
                        batch_size * batch_index : batch_size * (batch_index + 1)
                    ]
                    raw_batch, numberized_batch = zip(*batch)
                    yield BatchData(raw_batch, zip_dicts(numberized_batch))
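
# Illustrative sketch (values are hypothetical, not the defaults): a
# PoolingBatcher that loads ten pool-sized chunks at a time for shuffling,
# with each pool spanning 100 batches, trading extra memory for better
# shuffling across epochs:
#
#   batcher = PoolingBatcher(
#       train_batch_size=32,
#       pool_num_batches=100,
#       num_shuffled_pools=10,
#   )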


def pad_and_tensorize_batches(tensorizers, batches):
    for raw_batch, numberized_batch in batches:
        tensor_dict = {}
        for name, tensorizer in tensorizers.items():
            if isinstance(tensorizer, MetricTensorizer):
                tensor_dict[name] = tensorizer.tensorize(numberized_batch)
            else:
                tensor_dict[name] = tensorizer.tensorize(numberized_batch[name])
        yield raw_batch, tensor_dict


def zip_dicts(dicts):
    all_keys = set(itertools.chain.from_iterable(dicts))
    zipped = {key: [] for key in all_keys}
    for d in dicts:
        for key in all_keys:
            zipped[key].append(d.get(key))
    return zipped
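
# Illustrative example (hypothetical inputs): zip_dicts merges a list of dicts
# into a dict of lists, filling missing keys with None (key order may vary,
# since keys are collected into a set):
#
#   zip_dicts([{"a": 1}, {"a": 2, "b": 3}])
#   # -> {"a": [1, 2], "b": [None, 3]}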


def generator_iterator(fn):
    """Turn a generator into a GeneratorIterator-wrapped function.

    Effectively this allows iterating over a generator multiple times by
    recording the call arguments, and calling the generator with them anew
    each time __iter__ is called on the returned object."""

    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        return GeneratorIterator(fn, *args, **kwargs)

    return wrapped
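
# Illustrative sketch (the `squares` generator is hypothetical): per the
# docstring above, the wrapped function returns a GeneratorIterator that can be
# iterated more than once, re-invoking the generator with the recorded
# arguments each time:
#
#   @generator_iterator
#   def squares(n):
#       for i in range(n):
#           yield i * i
#
#   it = squares(3)
#   list(it)  # [0, 1, 4]
#   list(it)  # [0, 1, 4] again; __iter__ restarts the generator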


class Data(Component):
    """Data is an abstraction that handles all of the following:

    - Initialize model metadata parameters
    - Create batches of tensors for model training or prediction

    It can accomplish these in any way it needs to. The base implementation
    utilizes `pytext.data.sources.DataSource`, and sends batches to
    `pytext.data.tensorizers.Tensorizer` to create tensors.

    The `tensorizers` dict passed to the initializer should be considered
    something like a signature for the model. Each batch should be a dictionary
    with the same keys as the `tensorizers` dict, and values should be tensors
    arranged in the way specified by that tensorizer. The tensorizers dict
    doubles as a simple baseline implementation of that same signature, but
    subclasses of Data can override the implementation using other methods.
    This value is how the model specifies what inputs it's looking for.
    """

    __COMPONENT_TYPE__ = ComponentType.DATA_HANDLER
    __EXPANSIBLE__ = True

    class Config(Component.Config):
        #: Specify where training/test/eval data come from. The default value
        #: will not provide any data.
        source: DataSource.Config = TSVDataSource.Config()
        #: How training examples are split into batches for the optimizer.
        batcher: Batcher.Config = PoolingBatcher.Config()
        sort_key: Optional[str] = None
        #: Cache numberized results in memory; turn off when CPU-memory bound.
        in_memory: Optional[bool] = True

    @classmethod
    def from_config(
        cls,
        config: Config,
        schema: Dict[str, Type],
        tensorizers: Dict[str, Tensorizer],
        rank=0,
        world_size=1,
        init_tensorizers=True,
        **kwargs,
    ):
        data_source_cls = Registry.get(ComponentType.DATA_SOURCE, type(config.source))
        if issubclass(data_source_cls, ShardedDataSource):
            # The data source is already sharded; no need to wrap it in
            # RowShardedDataSource.
            data_source = create_component(
                ComponentType.DATA_SOURCE,
                config.source,
                schema,
                rank=rank,
                world_size=world_size,
            )
        else:
            unsharded_data_source = create_component(
                ComponentType.DATA_SOURCE, config.source, schema
            )
            data_source = RowShardedDataSource(
                data_source=unsharded_data_source, rank=rank, world_size=world_size
            )

        batcher = create_component(ComponentType.BATCHER, config.batcher)
        return cls(
            data_source,
            tensorizers,
            batcher=batcher,
            sort_key=config.sort_key,
            in_memory=config.in_memory,
            init_tensorizers=init_tensorizers,
            **kwargs,
        )

    def __init__(
        self,
        data_source: DataSource,
        tensorizers: Dict[str, Tensorizer],
        batcher: Batcher = None,
        sort_key: Optional[str] = None,
        in_memory: Optional[bool] = True,
        init_tensorizers: Optional[bool] = True,
        init_tensorizers_from_scratch: Optional[bool] = True,
    ):
        """This function should also initialize the passed-in tensorizers with
        metadata they need for model construction."""
        self.data_source = data_source
        self.tensorizers = tensorizers
        self.batcher = batcher or Batcher()
        self.sort_key = sort_key
        self.in_memory = in_memory
        self.numberized_cache: MutableMapping[str, Any] = {}
        self.cache_mutex: Dict[str, bool] = {}
        full_train_data = (
            data_source.train_unsharded
            if isinstance(data_source, ShardedDataSource)
            else data_source.train
        )
        if init_tensorizers:
            initialize_tensorizers(
                self.tensorizers, full_train_data, init_tensorizers_from_scratch
            )
        else:
            print(
                "Skipped initializing tensorizers since they are loaded from a "
                "previously saved state."
            )
        log_class_usage(__class__)

    def numberize_rows(self, rows):
        for row in rows:
            numberized = {
                name: tensorizer.numberize(row)
                for name, tensorizer in self.tensorizers.items()
            }
            yield RowData(row, numberized)

    def cache(self, numberized_rows, stage):
        if stage in self.cache_mutex:
            # Another generator is already caching the numberized data for this
            # stage; just pass rows through.
            for numberized_row in numberized_rows:
                yield numberized_row
        else:
            self.cache_mutex[stage] = True
            result = []
            for numberized_row in numberized_rows:
                result.append(numberized_row)
                yield numberized_row
            self.numberized_cache[stage] = result

    def add_row_indices(self, rows):
        for idx, row in enumerate(rows):
            row[RawExampleFieldName.ROW_INDEX] = idx
            yield row

    @generator_iterator
    def batches(self, stage: Stage, data_source=None, load_early=False):
        """Create batches of tensors to pass to model train_batch.

        This function yields dictionaries that mirror the `tensorizers` dict
        passed to `__init__`, i.e. the keys will be the same, and the tensors
        will be of the shape expected by the respective tensorizers.

        `stage` is used to determine which data source is used to create
        batches. If `data_source` is provided, it is used instead of the
        configured data source; this allows setting a different data source
        when testing a model. Passing `load_early=True` disables loading all
        data in memory and using PoolingBatcher, so that we get the first
        batch as quickly as possible.
        """
        data_source = data_source if data_source is not None else self.data_source
        rows = {
            Stage.TRAIN: data_source.train,
            Stage.TEST: data_source.test,
            Stage.EVAL: data_source.eval,
        }[stage]

        # We add row indices here so that the original order can be reproduced
        # after shuffling the data if necessary.
        indexed_rows = self.add_row_indices(rows)

        # rows and numberized_rows are generators which can iterate over large
        # datasets; be careful not to do any operations which will exhaust them.
        if self.in_memory and not load_early:
            numberized_rows = self.numberized_cache.get(stage, None)
            if numberized_rows is None:
                numberized_rows = self.cache(self.numberize_rows(indexed_rows), stage)
            else:
                print(f"Get numberized rows from cache in stage: {stage}")
        else:
            numberized_rows = self.numberize_rows(indexed_rows)
        sort_key = self.sort_key

        def key(row):
            return self.tensorizers[sort_key].sort_key(row.numberized[sort_key])

        if load_early:
            batcher = Batcher(
                self.batcher.train_batch_size,
                self.batcher.eval_batch_size,
                self.batcher.test_batch_size,
            )
        else:
            batcher = self.batcher
        batches = batcher.batchify(
            numberized_rows, sort_key=(key if sort_key else None), stage=stage
        )
        return pad_and_tensorize_batches(self.tensorizers, batches)
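
# Illustrative usage sketch (not part of the original module). The schema and
# tensorizers below are hypothetical placeholders; in practice they come from
# the model configuration, and the default TSVDataSource.Config() needs file
# paths before it yields any data:
#
#   tensorizers = {"tokens": my_token_tensorizer}  # hypothetical
#   data = Data.from_config(
#       Data.Config(), schema={"text": str}, tensorizers=tensorizers
#   )
#   for raw_batch, tensor_dict in data.batches(Stage.TRAIN):
#       ...  # tensor_dict has the same keys as the tensorizers dict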