Source code for pytext.data.sources.conllu

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import threading
from typing import Dict, List, Optional, Type

from pytext.data.sources.data_source import RootDataSource, SafeFileWrapper


class _RewindingSentenceReader:
    """Re-iterable wrapper that re-parses a file from the start on every pass.

    A bare generator can only be consumed once; wrapping the parse step in an
    object whose ``__iter__`` rewinds the file lets callers iterate the same
    split repeatedly (e.g. once per epoch).
    """

    def __init__(self, input_file, parse):
        self._file = input_file
        self._parse = parse

    def __iter__(self):
        # Rewind lazily (at the first ``next``) so the seek is immediately
        # followed by the read performed inside ``parse``.
        self._file.seek(0)
        yield from self._parse(self._file)


class CoNLLUPOSDataSource(RootDataSource):
    """DataSource which loads data from CoNLL-U file.

    Each yielded row is a dict with three keys: ``"text"`` (list of word
    forms), ``"label"`` (list of POS tags, parallel to ``"text"``) and
    ``"language"`` (the configured language, possibly ``None``).
    """

    class Config(RootDataSource.Config):
        #: Name of the language. If not set, languages will be empty.
        language: Optional[str] = None
        #: Filename of training set. If not set, iteration will be empty.
        train_filename: Optional[str] = None
        #: Filename of testing set. If not set, iteration will be empty.
        test_filename: Optional[str] = None
        #: Filename of eval set. If not set, iteration will be empty.
        eval_filename: Optional[str] = None
        #: Accepted for configuration compatibility. It is currently ignored
        #: by this data source: columns are read by position, not by name.
        field_names: Optional[List[str]] = None
        #: The column delimiter. CoNLL-U file default is \t.
        delimiter: str = "\t"

    @classmethod
    def from_config(cls, config: Config, schema: Dict[str, Type], **kwargs):
        """Build a data source from ``config``, opening every configured file.

        Files are opened as UTF-8 with undecodable bytes replaced. A split
        whose filename is unset (or empty) is passed on as ``None``.
        Remaining config fields and ``kwargs`` are forwarded to ``__init__``.
        """
        args = config._asdict()
        language = args.pop("language")

        def open_split(key):
            # Pop the filename so it is not forwarded to ``__init__`` via **args.
            filename = args.pop(key)
            if not filename:
                return None
            return SafeFileWrapper(filename, encoding="utf-8", errors="replace")

        train_file = open_split("train_filename")
        test_file = open_split("test_filename")
        eval_file = open_split("eval_filename")

        return cls(
            language=language,
            train_file=train_file,
            test_file=test_file,
            eval_file=eval_file,
            schema=schema,
            **args,
            **kwargs,
        )

    def __init__(
        self,
        language=None,
        train_file=None,
        test_file=None,
        eval_file=None,
        field_names=None,
        delimiter=Config.delimiter,
        **kwargs,
    ):
        """Store the parsing options and wrap each provided file.

        ``field_names`` is accepted so that config fields can be forwarded
        unchanged, but it is not used. Remaining ``kwargs`` go to the parent
        constructor. A missing split is represented by an empty list.
        """
        super().__init__(**kwargs)
        self.delimiter = delimiter
        self.language = language
        self._train_file = self._read_file(train_file) if train_file else []
        self._test_file = self._read_file(test_file) if test_file else []
        self._eval_file = self._read_file(eval_file) if eval_file else []

    def _read_file(self, input_file):
        """Reads CoNLL-U file.

        Returns a re-iterable object rather than a one-shot generator, so
        that every call to a ``raw_*_data_generator`` method starts again
        from the first sentence instead of finding the data exhausted.
        ``input_file`` must support ``seek(0)`` and ``readlines()``.
        """
        return _RewindingSentenceReader(input_file, self._parse_sentences)

    def _parse_sentences(self, input_file):
        """Yield one ``{"text", "label", "language"}`` dict per sentence.

        Raises:
            IndexError: if a row with a purely numeric first column has
                fewer than four delimiter-separated fields.
        """
        words, labels = [], []
        for line in input_file.readlines():
            tok = line.strip().split(self.delimiter)
            # A comment line or a line without a second column ends the
            # current sentence; CoNLL-U separates sentences with empty lines.
            if len(tok) < 2 or line.startswith("#"):
                if words:
                    yield {"text": words, "label": labels, "language": self.language}
                    words, labels = [], []
            elif tok[0].isdigit():
                # Only rows whose first column is a plain integer are kept;
                # the word is taken from column 1 and the POS tag from column 3.
                words.append(tok[1])
                labels.append(tok[3])
        # Flush the last sentence when the file does not end with a separator.
        if words:
            yield {"text": words, "label": labels, "language": self.language}

    def raw_train_data_generator(self):
        """Return a fresh iterator over the training sentences."""
        return iter(self._train_file)

    def raw_test_data_generator(self):
        """Return a fresh iterator over the test sentences."""
        return iter(self._test_file)

    def raw_eval_data_generator(self):
        """Return a fresh iterator over the eval sentences."""
        return iter(self._eval_file)
class CoNLLUNERFile:
    """Re-iterable reader for blank-line separated ``word<delim>label`` data.

    Every iteration rewinds ``file`` to the start and yields one dict per
    sentence with the keys ``"text"`` (list of words), ``"label"`` (list of
    labels, parallel to ``"text"``) and ``"language"``.

    Only one iteration may be in progress at a time; starting a second one
    before the first has finished (or been closed) raises ``Exception``.

    Args:
        file: file-like object supporting ``seek(0)`` and ``readlines()``.
        delim: column delimiter used to split each non-empty line.
        lang: value stored under ``"language"`` in every yielded dict.
    """

    def __init__(self, file, delim, lang):
        self.file = file
        self.delimiter = delim
        self.language = lang
        self._access_lock = threading.Lock()

    def __iter__(self):
        if not self._access_lock.acquire(blocking=False):
            raise Exception("Concurrent iteration not supported")
        try:
            # The rewind is inside the try block so that a failing seek()
            # still releases the lock instead of blocking all later passes.
            self.file.seek(0)
            words, labels = [], []
            for line in self.file.readlines():
                line = line.strip()
                if not line:
                    # An empty line terminates the current sentence.
                    if words:
                        yield {
                            "text": words,
                            "label": labels,
                            "language": self.language,
                        }
                        words, labels = [], []
                    continue
                tok = line.split(self.delimiter)
                # Lines that do not split into exactly two columns are skipped.
                if len(tok) == 2:
                    word, label = tok
                    words.append(word)
                    labels.append(label)
            # Flush the last sentence when the file has no trailing blank line.
            if words:
                yield {"text": words, "label": labels, "language": self.language}
        finally:
            self._access_lock.release()
class CoNLLUNERDataSource(CoNLLUPOSDataSource):
    """
    Data source for NER datasets in which sentences are separated by empty
    lines and every other line holds a word and its label, split by the
    configured delimiter.
    """

    def _read_file(self, input_file):
        """Wrap ``input_file`` in a ``CoNLLUNERFile`` using this source's settings."""
        reader = CoNLLUNERFile(
            file=input_file,
            delim=self.delimiter,
            lang=self.language,
        )
        return reader