#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import threading
from typing import Dict, List, Optional, Type
from pytext.data.sources.data_source import RootDataSource, SafeFileWrapper
class CoNLLUPOSDataSource(RootDataSource):
    """DataSource which loads POS-tagging data from a CoNLL-U file.

    CoNLL-U files separate sentences with empty lines and mark comments
    with a leading ``#``. Token lines are delimiter-separated; this reader
    takes the word form from column 1 and the POS tag from column 3.

    Each raw row produced by the generators is a dict with keys ``text``
    (list of words), ``label`` (list of POS tags) and ``language``.
    """

    class Config(RootDataSource.Config):
        #: Name of the language. If not set, languages will be empty.
        language: Optional[str] = None
        #: Filename of training set. If not set, iteration will be empty.
        train_filename: Optional[str] = None
        #: Filename of testing set. If not set, iteration will be empty.
        test_filename: Optional[str] = None
        #: Filename of eval set. If not set, iteration will be empty.
        eval_filename: Optional[str] = None
        #: Field names for the TSV. If this is not set, the first line of each file
        #: will be assumed to be a header containing the field names.
        field_names: Optional[List[str]] = None
        #: The column delimiter. CoNLL-U file default is \t.
        delimiter: str = "\t"

    @classmethod
    def from_config(cls, config: Config, schema: Dict[str, Type], **kwargs):
        """Create a data source from config, opening any configured files.

        Files are read as UTF-8 with undecodable bytes replaced rather than
        raising, so dirty corpora do not abort loading. A missing filename
        simply makes the corresponding split empty.
        """
        args = config._asdict()
        language = args.pop("language")

        def _open(filename):
            # One place for the wrapper construction instead of three copies.
            return (
                SafeFileWrapper(filename, encoding="utf-8", errors="replace")
                if filename
                else None
            )

        train_file = _open(args.pop("train_filename"))
        test_file = _open(args.pop("test_filename"))
        eval_file = _open(args.pop("eval_filename"))
        return cls(
            language=language,
            train_file=train_file,
            test_file=test_file,
            eval_file=eval_file,
            schema=schema,
            **args,  # remaining Config fields: field_names, delimiter
            **kwargs,
        )

    def __init__(
        self,
        language=None,
        train_file=None,
        test_file=None,
        eval_file=None,
        field_names=None,
        delimiter=Config.delimiter,
        **kwargs,
    ):
        # NOTE(review): field_names is accepted for Config compatibility but
        # never used here — CoNLL-U columns are positional.
        super().__init__(**kwargs)
        self.delimiter = delimiter
        self.language = language
        self._train_file = self._read_file(train_file) if train_file else []
        self._test_file = self._read_file(test_file) if test_file else []
        self._eval_file = self._read_file(eval_file) if eval_file else []

    def _read_file(self, input_file):
        """Parse a CoNLL-U file into a list of per-sentence row dicts.

        BUG FIX: this previously returned a one-shot generator; after the
        first pass (e.g. the first training epoch) raw_*_data_generator()
        yielded nothing. Materializing into a list makes iteration
        repeatable. Subclasses may instead return any re-iterable object.
        """
        return list(self._iter_sentences(input_file))

    def _iter_sentences(self, input_file):
        """Yield one {"text", "label", "language"} dict per sentence."""
        words, labels = [], []
        for line in input_file.readlines():
            tok = line.strip().split(self.delimiter)
            # A comment line or a line with fewer than two columns (e.g. the
            # empty line CoNLL-U uses between sentences) ends the current
            # sentence.
            if len(tok) < 2 or line[0] == "#":
                assert len(words) == len(labels)
                if words:
                    yield {"text": words, "label": labels, "language": self.language}
                words, labels = [], []
            elif tok[0].isdigit():
                # Only lines whose first column is a plain integer token ID
                # are used; other first columns (e.g. "1-2") fail isdigit().
                word, pos = tok[1], tok[3]
                words.append(word)
                labels.append(pos)
        # Flush a final sentence that has no trailing empty line.
        if len(words) == len(labels) and words:
            yield {"text": words, "label": labels, "language": self.language}

    def raw_train_data_generator(self):
        return iter(self._train_file)

    def raw_test_data_generator(self):
        return iter(self._test_file)

    def raw_eval_data_generator(self):
        return iter(self._eval_file)
class CoNLLUNERFile:
    """A re-iterable view over a CoNLL-style NER file.

    The file contains one ``word<delim>label`` pair per line, with empty
    lines separating sentences. Every iteration rewinds the underlying
    handle and yields one {"text", "label", "language"} dict per sentence.
    """

    def __init__(self, file, delim, lang):
        self.file = file
        self.delimiter = delim
        self.language = lang
        self._access_lock = threading.Lock()

    def __iter__(self):
        # The file handle is shared and rewound on each pass, so two
        # simultaneous iterations would corrupt each other; allow only one.
        if not self._access_lock.acquire(blocking=False):
            raise Exception("Concurrent iteration not supported")
        self.file.seek(0)
        try:
            words, labels = [], []
            for raw_line in self.file.readlines():
                stripped = raw_line.strip()
                columns = stripped.split(self.delimiter)
                if not stripped:
                    # Sentence boundary: emit what we accumulated, if any.
                    assert len(words) == len(labels)
                    if words:
                        yield {
                            "text": words,
                            "label": labels,
                            "language": self.language,
                        }
                    words, labels = [], []
                elif len(columns) == 2:
                    # Lines without exactly two columns are silently skipped.
                    words.append(columns[0])
                    labels.append(columns[1])
            # A final sentence may end at EOF without a blank line.
            if words and len(words) == len(labels):
                yield {"text": words, "label": labels, "language": self.language}
        finally:
            self._access_lock.release()
class CoNLLUNERDataSource(CoNLLUPOSDataSource):
    """
    Reads an empty line separated data (word \t label).
    This data source supports datasets for NER tasks
    """

    def _read_file(self, input_file):
        # Instead of parsing eagerly, hand back a re-iterable wrapper that
        # rewinds and re-parses the file on every pass.
        return CoNLLUNERFile(input_file, self.delimiter, self.language)