#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import unittest
from typing import List
import numpy as np
import torch
from pytext.data.bert_tensorizer import BERTTensorizer, BERTTensorizerScriptImpl
from pytext.data.roberta_tensorizer import (
RoBERTaTensorizer,
RoBERTaTensorizerScriptImpl,
)
from pytext.data.sources import SquadDataSource
from pytext.data.sources.data_source import Gazetteer, SafeFileWrapper, load_float_list
from pytext.data.sources.tsv import SessionTSVDataSource, TSVDataSource
from pytext.data.squad_for_bert_tensorizer import (
SquadForBERTTensorizer,
SquadForRoBERTaTensorizer,
)
from pytext.data.squad_tensorizer import SquadTensorizer
from pytext.data.tensorizers import (
AnnotationNumberizer,
ByteTensorizer,
ByteTokenTensorizer,
FloatListTensorizer,
GazetteerTensorizer,
LabelListTensorizer,
LabelTensorizer,
SeqTokenTensorizer,
TokenTensorizer,
VocabConfig,
VocabFileConfig,
initialize_tensorizers,
lookup_tokens,
)
from pytext.data.tokenizers import (
DoNothingTokenizer,
GPT2BPETokenizer,
Tokenizer,
WordPieceTokenizer,
)
from pytext.data.utils import BOS, EOS, Vocabulary
from pytext.utils.test import import_tests_module
tests_module = import_tests_module()


class LookupTokensTest(unittest.TestCase):
    def test_lookup_tokens(self):
text = "let's tokenize this"
tokenizer = Tokenizer()
vocab = Vocabulary(text.split() + [BOS, EOS])
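        # Vocabulary assigns indices in insertion order, so here
        # "let's" -> 0, "tokenize" -> 1, "this" -> 2, BOS -> 3, EOS -> 4,
        # which the assertions below rely on.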
tokens, start_idx, end_idx = lookup_tokens(
text, tokenizer=tokenizer, vocab=vocab, bos_token=None, eos_token=None
)
self.assertEqual(tokens, [0, 1, 2])
self.assertEqual(start_idx, (0, 6, 15))
self.assertEqual(end_idx, (5, 14, 19))
tokens, start_idx, end_idx = lookup_tokens(
text, tokenizer=tokenizer, vocab=vocab, bos_token=BOS, eos_token=EOS
)
self.assertEqual(tokens, [3, 0, 1, 2, 4])
self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
self.assertEqual(end_idx, (-1, 5, 14, 19, -1))


class ListTensorizersTest(unittest.TestCase):
    def setUp(self):
self.data = SessionTSVDataSource(
SafeFileWrapper(tests_module.test_file("seq_tagging_example.tsv")),
field_names=["session_id", "intent", "goal", "label"],
schema={"intent": List[str], "goal": List[str], "label": List[str]},
)

    def test_initialize_list_tensorizers(self):
tensorizers = {
"intent": LabelListTensorizer(
label_column="intent", pad_in_vocab=True, allow_unknown=True
),
"goal": LabelListTensorizer(label_column="goal"),
}
initialize_tensorizers(tensorizers, self.data.train)
self.assertEqual(9, len(tensorizers["intent"].vocab))
self.assertEqual(7, len(tensorizers["goal"].vocab))

    def test_create_label_list_tensors(self):
tensorizers = {
"intent": LabelListTensorizer(
label_column="intent", pad_in_vocab=True, allow_unknown=True
)
}
initialize_tensorizers(tensorizers, self.data.train)
tensors = [tensorizers["intent"].numberize(row) for row in self.data.train]
# test label idx
self.assertEqual([2, 3], tensors[0][0])
self.assertEqual([4, 5], tensors[1][0])
self.assertEqual([6, 7, 8], tensors[2][0])
# test seq lens
self.assertEqual(2, tensors[0][1])
self.assertEqual(2, tensors[1][1])
self.assertEqual(3, tensors[2][1])
self.assertEqual(3, len(tensors))
tensors, lens = tensorizers["intent"].tensorize(tensors)
np.testing.assert_array_almost_equal(
np.array([[2, 3, 1], [4, 5, 1], [6, 7, 8]]), tensors.detach().numpy()
)
np.testing.assert_array_almost_equal(np.array([2, 2, 3]), lens.detach().numpy())

    def test_label_list_tensors_no_pad_in_vocab(self):
tensorizers = {
"intent": LabelListTensorizer(
label_column="intent", pad_in_vocab=False, allow_unknown=True
)
}
initialize_tensorizers(tensorizers, self.data.train)
self.assertEqual(8, len(tensorizers["intent"].vocab))
tensors = []
for row in self.data.train:
row["intent"].append("unknown")
tensors.append(tensorizers["intent"].numberize(row))
tensors, lens = tensorizers["intent"].tensorize(tensors)
np.testing.assert_array_almost_equal(
np.array([[1, 2, 0, -1], [3, 4, 0, -1], [5, 6, 7, 0]]),
tensors.detach().numpy(),
)
# fmt: off
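# Reference output for test_annotation_num below: each inner list is the
# numberized shift-reduce action sequence for one training row (in that
# test's vocab, 1 is SHIFT and 3 is REDUCE).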
EXPECTED_ACTIONS = [
[0, 1, 1, 2, 1, 3, 3],
[4, 1, 5, 1, 1, 1, 3, 1, 6, 7, 8, 1, 3, 1, 3, 3, 1, 1, 9, 1, 1, 1, 1, 1, 3, 1, 3],
[10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3],
[0, 1, 1, 1, 1, 1, 3],
[11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 7, 1, 3, 3, 1, 5, 12, 1, 3, 3, 1, 1, 1, 9, 1, 1, 1, 3, 1, 3],
[4, 1, 1, 1, 1, 5, 1, 3, 1, 1, 13, 1, 1, 1, 3, 3],
[4, 1, 1, 1, 1, 1, 5, 7, 1, 3, 3, 3],
[14, 1, 1, 1, 6, 1, 3, 1, 5, 1, 3, 3],
[0, 1, 1, 1, 1, 1, 2, 1, 3, 3],
[10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3],
]
# fmt: on


class TensorizersTest(unittest.TestCase):
    def setUp(self):
self.data = TSVDataSource(
SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
eval_file=None,
field_names=["label", "slots", "text", "dense"],
schema={"text": str, "label": str},
)
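
    # Minimal driver for the tensorizer initialization protocol:
    # initialize() returns a coroutine-style generator, so we prime it
    # with send(None), feed it one row at a time, and close() it so the
    # tensorizer can finalize state such as its vocab.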
def _initialize_tensorizer(self, tensorizer, data=None):
if data is None:
data = self.data
init = tensorizer.initialize()
init.send(None) # kick
for row in data.train:
init.send(row)
init.close()

    def test_initialize_tensorizers(self):
tensorizers = {
"tokens": TokenTensorizer(text_column="text"),
"labels": LabelTensorizer(label_column="label"),
"chars": ByteTensorizer(text_column="text"),
}
initialize_tensorizers(tensorizers, self.data.train)
self.assertEqual(49, len(tensorizers["tokens"].vocab))
self.assertEqual(7, len(tensorizers["labels"].vocab))

    def test_initialize_token_tensorizer(self):
# default (build from data)
tensorizer = TokenTensorizer(text_column="text")
self._initialize_tensorizer(tensorizer)
self.assertEqual(49, len(tensorizer.vocab))
# size limit on tokens from data
tensorizer = TokenTensorizer(
text_column="text", vocab_config=VocabConfig(size_from_data=3)
)
self._initialize_tensorizer(tensorizer)
self.assertEqual(5, len(tensorizer.vocab)) # 3 + unk token + pad token
embed_file = tests_module.test_file("pretrained_embed_raw")
# vocab from data + vocab_file
tensorizer = TokenTensorizer(
text_column="text",
vocab_config=VocabConfig(
size_from_data=3,
vocab_files=[
VocabFileConfig(filepath=embed_file, skip_header_line=True)
],
),
)
self._initialize_tensorizer(tensorizer)
self.assertEqual(15, len(tensorizer.vocab))
# vocab just from vocab_file
tensorizer = TokenTensorizer(
text_column="text",
vocab_config=VocabConfig(
build_from_data=False,
vocab_files=[
VocabFileConfig(
filepath=embed_file, skip_header_line=True, size_limit=5
)
],
),
)
init = tensorizer.initialize()
        # with build_from_data=False the generator finishes immediately,
        # so priming it raises StopIteration
with self.assertRaises(StopIteration):
init.send(None)
self.assertEqual(7, len(tensorizer.vocab)) # 5 + unk token + pad token

    def test_create_word_tensors(self):
tensorizer = TokenTensorizer(text_column="text")
self._initialize_tensorizer(tensorizer)
rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
tensors = (tensorizer.numberize(row) for row in rows)
tokens, seq_len, token_ranges = next(tensors)
self.assertEqual([24, 0, 0, 0], tokens)
self.assertEqual(4, seq_len)
self.assertEqual([(0, 1), (2, 6), (7, 11), (12, 18)], token_ranges)
tokens, seq_len, token_ranges = next(tensors)
self.assertEqual([13, 47, 9], tokens)
self.assertEqual(3, seq_len)
self.assertEqual([(0, 4), (5, 7), (8, 10)], token_ranges)

    def test_create_byte_tensors(self):
tensorizer = ByteTensorizer(text_column="text", lower=False)
        # no initialization needed: initialize() is a no-op for ByteTensorizer
s1 = "I want some coffee"
s2 = "Turn it up"
s3 = "我不会说中文"
rows = [{"text": s1}, {"text": s2}, {"text": s3}]
expected = [list(s1.encode()), list(s2.encode()), list(s3.encode())]
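        # str.encode() defaults to UTF-8: ASCII chars are one byte each,
        # while each of the six CJK chars in s3 encodes to three bytes,
        # so its expected list holds 18 values.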
tensors = [tensorizer.numberize(row) for row in rows]
self.assertEqual([(bytes, len(bytes)) for bytes in expected], tensors)

    def test_byte_tensors_error_code(self):
tensorizer = ByteTensorizer(
text_column="text", lower=False, add_bos_token=True, add_eos_token=True
)
s1 = "I want some coffee#"
s2 = "This is ^the best show I've ever seen"
rows = [{"text": s1}, {"text": s2}]
expected_error_code = 1
with self.assertRaises(SystemExit) as cm:
for row in rows:
tensorizer.numberize(row)
self.assertEqual(cm.exception.code, expected_error_code)

    def test_create_byte_token_tensors(self):
tensorizer = ByteTokenTensorizer(
text_column="text", max_seq_len=4, max_byte_len=5
)
        # no initialization needed: initialize() is a no-op for this tensorizer
s1 = "I want some coffee today"
s2 = "Turn it up"
        def ords(word, pad_to):
            # zero-pad the UTF-8 bytes of `word` out to pad_to entries
            encoded = list(word.encode())
            return encoded + [0] * (pad_to - len(encoded))
batch = [{"text": s1}, {"text": s2}]
# Note that the tokenizer lowercases here
expected = [
[ords("i", 5), ords("want", 5), ords("some", 5), ords("coffe", 5)],
[ords("turn", 5), ords("it", 5), ords("up", 5), ords("", 5)],
]
expected_token_lens = [4, 3]
expected_byte_lens = [[1, 4, 4, 5], [4, 2, 2, 0]]
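        # max_seq_len=4 drops the fifth token ("today") and max_byte_len=5
        # truncates "coffee" to "coffe"; byte_lens records the unpadded
        # byte count of each kept token.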
bytes, token_lens, byte_lens = tensorizer.tensorize(
[tensorizer.numberize(row) for row in batch]
)
self.assertIsInstance(bytes, torch.LongTensor)
self.assertIsInstance(token_lens, torch.LongTensor)
self.assertIsInstance(byte_lens, torch.LongTensor)
self.assertEqual((2, 4, 5), bytes.size())
self.assertEqual((2,), token_lens.size())
self.assertEqual((2, 4), byte_lens.size())
self.assertEqual(expected, bytes.tolist())
self.assertEqual(expected_token_lens, token_lens.tolist())
self.assertEqual(expected_byte_lens, byte_lens.tolist())

    def test_initialize_label_tensorizer(self):
tensorizer = LabelTensorizer(label_column="label")
self._initialize_tensorizer(tensorizer)
self.assertEqual(7, len(tensorizer.vocab))

    def test_create_label_tensors(self):
tensorizer = LabelTensorizer(label_column="label")
self._initialize_tensorizer(tensorizer)
rows = [
{"label": "weather/find"},
{"label": "alarm/set_alarm"},
{"label": "non/existent"},
]
tensors = (tensorizer.numberize(row) for row in rows)
tensor = next(tensors)
self.assertEqual(6, tensor)
tensor = next(tensors)
self.assertEqual(1, tensor)
with self.assertRaises(Exception):
tensor = next(tensors)

    def test_gazetteer_tensor_bad_json(self):
tensorizer = GazetteerTensorizer()
data = TSVDataSource(
train_file=SafeFileWrapper(
tests_module.test_file("train_dict_features_bad_json.tsv")
),
test_file=None,
eval_file=None,
field_names=["text", "dict"],
schema={"text": str, "dict": Gazetteer},
)
init = tensorizer.initialize()
init.send(None) # kick
with self.assertRaises(Exception):
for row in data.train:
init.send(row)
init.close()

    def test_gazetteer_tensor(self):
tensorizer = GazetteerTensorizer()
data = TSVDataSource(
train_file=SafeFileWrapper(
tests_module.test_file("train_dict_features.tsv")
),
test_file=None,
eval_file=None,
field_names=["text", "dict"],
schema={"text": str, "dict": Gazetteer},
)
self._initialize_tensorizer(tensorizer, data)
# UNK + PAD + 5 labels
self.assertEqual(7, len(tensorizer.vocab))
# only two rows in test file:
# "Order coffee from Starbucks please"
# "Order some fries from McDonalds please"
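        # numberize appears to flatten dict features to one slot per token
        # times the max matches for any token in the utterance: the first
        # row has 5 tokens and "coffee" matches two entries (weights
        # 0.8 / 0.2), giving 10 idx/weight values; the second has 6 tokens
        # with at most one match each.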
for i, row in enumerate(data.train):
if i == 0:
idx, weights, lens = tensorizer.numberize(row)
self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], idx)
self.assertEqual(
[0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], weights
)
self.assertEqual([1, 2, 1, 1, 1], lens)
if i == 1:
idx, weights, lens = tensorizer.numberize(row)
self.assertEqual([1, 1, 5, 1, 6, 1], idx)
self.assertEqual([0.0, 0.0, 1.0, 0.0, 1.0, 0.0], weights)
self.assertEqual([1, 1, 1, 1, 1, 1], lens)
feats, weights, lens = tensorizer.tensorize(
tensorizer.numberize(row) for row in data.train
)
self.assertEqual(
[
[1, 1, 2, 3, 1, 1, 4, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 5, 1, 1, 1, 6, 1, 1, 1],
],
feats.numpy().tolist(),
)
self.assertEqual(
str(
[
[0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
]
),
str(
[[round(w, 2) for w in utt_weights] for utt_weights in weights.numpy()]
),
)
self.assertEqual(
[[1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], lens.numpy().tolist()
)

    def test_seq_tensor(self):
tensorizer = SeqTokenTensorizer()
data = TSVDataSource(
train_file=SafeFileWrapper(
tests_module.test_file("train_seq_features.tsv")
),
test_file=None,
eval_file=None,
field_names=["text_seq"],
schema={"text_seq": List[str]},
)
self._initialize_tensorizer(tensorizer, data)
# UNK + PAD + 6 tokens
self.assertEqual(8, len(tensorizer.vocab))
# only one row in test file:
# ["where do you wanna meet?", "MPK"]
for row in data.train:
tokens, token_lens = tensorizer.prepare_input(row)
idx, lens = tensorizer.numberize(row)
self.assertEqual(2, lens)
self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)
self.assertEqual(2, token_lens)
self.assertEqual(
[
["where", "do", "you", "wanna", "meet?"],
["mpk", "__PAD__", "__PAD__", "__PAD__", "__PAD__"],
],
tokens,
)

    def test_seq_tensor_with_bos_eos_eol_bol(self):
tensorizer = SeqTokenTensorizer(
add_bos_token=True,
add_eos_token=True,
add_bol_token=True,
add_eol_token=True,
)
data = TSVDataSource(
train_file=SafeFileWrapper(
tests_module.test_file("train_seq_features.tsv")
),
test_file=None,
eval_file=None,
field_names=["text_seq"],
schema={"text_seq": List[str]},
)
self._initialize_tensorizer(tensorizer, data)
# UNK + PAD + BOS + EOS + BOL + EOL + 6 tokens
self.assertEqual(12, len(tensorizer.vocab))
# only one row in test file:
# ["where do you wanna meet?", "MPK"]
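        # the list is wrapped in __BEGIN_OF_LIST__ / __END_OF_LIST__
        # pseudo-sentences, and each sentence gets BOS/EOS, as the
        # expected tokens below show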
for row in data.train:
idx, lens = tensorizer.numberize(row)
tokens, token_lens = tensorizer.prepare_input(row)
self.assertEqual(4, lens)
self.assertEqual(4, token_lens)
self.assertEqual(
[
[2, 4, 3, 1, 1, 1, 1],
[2, 6, 7, 8, 9, 10, 3],
[2, 11, 3, 1, 1, 1, 1],
[2, 5, 3, 1, 1, 1, 1],
],
idx,
)
self.assertEqual(
[
[
"__BEGIN_OF_SENTENCE__",
"__BEGIN_OF_LIST__",
"__END_OF_SENTENCE__",
"__PAD__",
"__PAD__",
"__PAD__",
"__PAD__",
],
[
"__BEGIN_OF_SENTENCE__",
"where",
"do",
"you",
"wanna",
"meet?",
"__END_OF_SENTENCE__",
],
[
"__BEGIN_OF_SENTENCE__",
"mpk",
"__END_OF_SENTENCE__",
"__PAD__",
"__PAD__",
"__PAD__",
"__PAD__",
],
[
"__BEGIN_OF_SENTENCE__",
"__END_OF_LIST__",
"__END_OF_SENTENCE__",
"__PAD__",
"__PAD__",
"__PAD__",
"__PAD__",
],
],
tokens,
)

    def test_create_float_list_tensor(self):
tensorizer = FloatListTensorizer(
column="dense", dim=2, error_check=True, normalize=False
)
tests = [
("[0.1,0.2]", [0.1, 0.2]), # comma
("[0.1, 0.2]", [0.1, 0.2]), # comma with single space
("[0.1, 0.2]", [0.1, 0.2]), # comma with multiple spaces
("[0.1 0.2]", [0.1, 0.2]), # space
("[0.1 0.2]", [0.1, 0.2]), # multiple spaces
("[ 0.1 0.2]", [0.1, 0.2]), # space after [
("[0.1 0.2 ]", [0.1, 0.2]), # space before ]
("[0. 1.]", [0.0, 1.0]), # 0., 1.
]
for raw, expected in tests:
row = {"dense": load_float_list(raw)}
numberized = tensorizer.numberize(row)
self.assertEqual(expected, numberized)

    def test_create_normalized_float_list_tensor(self):
        def round_list(lst):
            return [float("%.4f" % n) for n in lst]
data = TSVDataSource(
SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
eval_file=None,
field_names=["label", "slots", "text", "dense_feat"],
schema={"text": str, "label": str, "dense_feat": List[float]},
)
tensorizer = FloatListTensorizer(
column="dense_feat", dim=10, error_check=True, normalize=True
)
self._initialize_tensorizer(tensorizer, data)
self.assertEqual(10, tensorizer.normalizer.num_rows)
self.assertEqual(
round_list(
[
7.56409,
8.2388,
0.5531,
0.2403,
1.03130,
6.2888,
3.1595,
0.1538,
0.2403,
5.3463,
]
),
round_list(tensorizer.normalizer.feature_sums),
)
self.assertEqual(
round_list(
[
5.80172,
7.57586,
0.30591,
0.05774,
0.52762,
5.22811,
2.51727,
0.02365,
0.05774,
4.48798,
]
),
round_list(tensorizer.normalizer.feature_squared_sums),
)
self.assertEqual(
round_list(
[
0.75640,
0.82388,
0.05531,
0.02403,
0.10313,
0.62888,
0.31595,
0.01538,
0.02403,
0.53463,
]
),
round_list(tensorizer.normalizer.feature_avgs),
)
self.assertEqual(
round_list(
[
0.08953,
0.28072,
0.16593,
0.07209,
0.20524,
0.35682,
0.38974,
0.04614,
0.07209,
0.40369,
]
),
round_list(tensorizer.normalizer.feature_stddevs),
)
row = [0.64840776, 0.7575, 0.5531, 0.2403, 0, 0.9481, 0, 0.1538, 0.2403, 0.3564]
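        # numberize z-scores each feature as (value - mean) / stddev; e.g.
        # feature 0: (0.64840776 - 0.75640) / 0.08953 ≈ -1.206, matching
        # the expected output below (the exact figure uses unrounded stats).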
output = tensorizer.numberize({"dense_feat": row})
self.assertEqual(
round_list(
[
-1.20619,
-0.23646,
2.99999,
3.0,
-0.50246,
0.89462,
-0.81066,
2.99999,
3.0,
-0.44149,
]
),
round_list(output),
)

    def test_annotation_num(self):
data = TSVDataSource(
SafeFileWrapper(tests_module.test_file("compositional_seq2seq_unit.tsv")),
test_file=None,
eval_file=None,
field_names=["text", "seqlogical"],
schema={"text": str, "seqlogical": str},
)
nbrz = AnnotationNumberizer()
self._initialize_tensorizer(nbrz, data)
# vocab = {'IN:GET_INFO_TRAFFIC': 0, 'SHIFT': 1, 'SL:LOCATION': 2,
# 'REDUCE': 3, 'IN:GET_DIRECTIONS': 4, 'SL:DESTINATION': 5, 'SL:SOURCE': 6,
# 'IN:GET_LOCATION_HOME': 7, 'SL:CONTACT': 8, 'SL:DATE_TIME_DEPARTURE': 9,
# 'IN:UNSUPPORTED_NAVIGATION': 10, 'IN:GET_ESTIMATED_DURATION': 11,
# 'IN:GET_LOCATION_WORK': 12, 'SL:PATH_AVOID': 13, 'IN:GET_DISTANCE': 14}
self.assertEqual(15, len(nbrz.vocab))
self.assertEqual(1, nbrz.shift_idx)
self.assertEqual(3, nbrz.reduce_idx)
self.assertEqual([10], nbrz.ignore_subNTs_roots)
self.assertEqual(
[0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], nbrz.valid_NT_idxs
)
self.assertEqual([0, 4, 7, 10, 11, 12, 14], nbrz.valid_IN_idxs)
self.assertEqual([2, 5, 6, 8, 9, 13], nbrz.valid_SL_idxs)
for row, expected in zip(data.train, EXPECTED_ACTIONS):
actions = nbrz.numberize(row)
self.assertEqual(expected, actions)


class BERTTensorizerTest(unittest.TestCase):
    def test_bert_tensorizer(self):
sentence = "<SOS> Focus Driving School Mulungushi bus station along Kasuba road, wamkopeka building. Ndola, Zambia."
# expected result was obtained offline by running BertModelDataHandler
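        # (the leading 101 and trailing 102 presumably correspond to the
        # [CLS] and [SEP] ids in this wordpiece vocab)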
expected = [
101,
133,
278,
217,
135,
175,
287,
766,
462,
100,
379,
182,
459,
334,
459,
280,
504,
462,
425,
283,
171,
462,
567,
474,
180,
262,
217,
459,
931,
262,
913,
117,
192,
262,
407,
478,
287,
744,
263,
478,
262,
560,
119,
183,
282,
287,
843,
117,
195,
262,
407,
931,
566,
119,
102,
]
row = {"text": sentence}
tensorizer = BERTTensorizer.from_config(
BERTTensorizer.Config(
tokenizer=WordPieceTokenizer.Config(
wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
)
)
)
tensorizer_impl = BERTTensorizerScriptImpl(
tokenizer=DoNothingTokenizer(),
vocab=tensorizer.vocab,
max_seq_len=tensorizer.max_seq_len,
).torchscriptify()
tokens, segment_label, seq_len, positions = tensorizer.numberize(row)
self.assertEqual(tokens, expected)
self.assertEqual(seq_len, len(expected))
self.assertEqual(segment_label, [0] * len(expected))
tokens, pad_mask, segment_labels, _ = tensorizer.tensorize(
[(tokens, segment_label, seq_len, positions)]
)
self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))
per_sentence_tokens = [tensorizer.tokenizer.tokenize(sentence)]
tokens, segment_label, seq_len, positions = tensorizer_impl.numberize(
per_sentence_tokens
)
self.assertEqual(tokens, expected)
self.assertEqual(seq_len, len(expected))
self.assertEqual(segment_label, [0] * len(expected))
tokens, pad_mask, segment_labels, _ = tensorizer_impl.tensorize(
[tokens], [segment_label], [seq_len], [positions]
)
self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))

    def test_bert_pair_tensorizer(self):
sentences = ["Focus", "Driving School"]
expected_tokens = [101, 175, 287, 766, 462, 102, 100, 379, 102]
expected_segment_labels = [0, 0, 0, 0, 0, 0, 1, 1, 1]
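        # segment labels mark the first segment ([CLS] + sentence 1 + [SEP])
        # with 0 and the second (sentence 2 + trailing [SEP]) with 1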
row = {"text1": sentences[0], "text2": sentences[1]}
tensorizer = BERTTensorizer.from_config(
BERTTensorizer.Config(
columns=["text1", "text2"],
tokenizer=WordPieceTokenizer.Config(
wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
),
)
)
tokens, segment_labels, seq_len, _ = tensorizer.numberize(row)
self.assertEqual(tokens, expected_tokens)
self.assertEqual(segment_labels, expected_segment_labels)
self.assertEqual(seq_len, len(expected_tokens))


class RobertaTensorizerTest(unittest.TestCase):
    def test_roberta_tensorizer(self):
text = "Prototype"
tokens = [[0, 4, 5, 2]]
pad_masks = [[1, 1, 1, 1]]
segment_labels = [[0, 0, 0, 0]]
positions = [[0, 1, 2, 3]]
expected = [tokens, pad_masks, segment_labels, positions]
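        # 0 and 2 bracket the token ids, consistent with RoBERTa's
        # conventional <s>/</s> (BOS/EOS) indices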
tensorizer = RoBERTaTensorizer.from_config(
RoBERTaTensorizer.Config(
tokenizer=GPT2BPETokenizer.Config(
bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
),
vocab_file="pytext/data/test/data/gpt2_dict.txt",
max_seq_len=256,
)
)
tensors = tensorizer.tensorize([tensorizer.numberize({"text": text})])
for tensor, expect in zip(tensors, expected):
self.assertEqual(tensor.tolist(), expect)
tensorizer_impl = RoBERTaTensorizerScriptImpl(
tokenizer=DoNothingTokenizer(),
vocab=tensorizer.vocab,
max_seq_len=tensorizer.max_seq_len,
)
script_tensorizer_impl = tensorizer_impl.torchscriptify()
per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d = zip(
*[script_tensorizer_impl.numberize(per_sentence_tokens)]
)
script_tensors = script_tensorizer_impl.tensorize(
tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d
)
for tensor, expect in zip(script_tensors, expected):
self.assertEqual(tensor.tolist(), expect)
        # test that torchscriptify can be called multiple times
tensorizer_impl.torchscriptify()


class SquadForRobertaTensorizerTest(unittest.TestCase):
    def test_squad_roberta_tensorizer(self):
row = {
"id": 0,
"doc": "Prototype",
"question": "otype",
"answers": ["Prot"],
"answer_starts": [0],
"has_answer": True,
}
tensorizer = SquadForRoBERTaTensorizer.from_config(
SquadForRoBERTaTensorizer.Config(
tokenizer=GPT2BPETokenizer.Config(
bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
),
vocab_file="pytext/data/test/data/gpt2_dict.txt",
max_seq_len=250,
)
)
tokens, segments, seq_len, positions, start, end = tensorizer.numberize(row)
        # check against the manually verified answer position in tokenized
        # output; this row has a single answer
self.assertEqual(start, [5])
self.assertEqual(end, [5])
self.assertEqual(len(tokens), seq_len)
self.assertEqual(len(segments), seq_len)


class SquadForBERTTensorizerTest(unittest.TestCase):
    def test_squad_tensorizer(self):
source = SquadDataSource.from_config(
SquadDataSource.Config(
eval_filename=tests_module.test_file("squad_tiny.json")
)
)
row = next(iter(source.eval))
tensorizer = SquadForBERTTensorizer.from_config(
SquadForBERTTensorizer.Config(
tokenizer=WordPieceTokenizer.Config(
wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
),
max_seq_len=250,
)
)
tokens, segments, seq_len, positions, start, end = tensorizer.numberize(row)
# check against manually verified answer positions in tokenized output
# there are 4 identical answers
self.assertEqual(start, [83, 83, 83, 83])
self.assertEqual(end, [87, 87, 87, 87])
self.assertEqual(len(tokens), seq_len)
self.assertEqual(len(segments), seq_len)
tensorizer.max_seq_len = 50
# answer should be truncated out
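        # (-100 matches the default ignore_index of PyTorch's cross-entropy
        # loss, presumably so truncated answers drop out of training)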
_, _, _, _, start, end = tensorizer.numberize(row)
self.assertEqual(start, [-100, -100, -100, -100])
self.assertEqual(end, [-100, -100, -100, -100])
self.assertEqual(len(tokens), seq_len)
self.assertEqual(len(segments), seq_len)


class SquadTensorizerTest(unittest.TestCase):
    def setUp(self):
self.json_data_source = SquadDataSource.from_config(
SquadDataSource.Config(
train_filename=tests_module.test_file("squad_tiny.json"),
eval_filename=None,
test_filename=None,
)
)
self.tsv_data_source = SquadDataSource.from_config(
SquadDataSource.Config(
train_filename=tests_module.test_file("squad_tiny.tsv"),
eval_filename=None,
test_filename=None,
)
)
self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
SquadTensorizer.Config(
tokenizer=WordPieceTokenizer.Config(
wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
),
max_seq_len=250,
)
)
self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
SquadTensorizer.Config(
tokenizer=Tokenizer.Config(split_regex=r"\W+"), max_seq_len=250
)
)
def _init_tensorizer(self, tsv=False):
tensorizer_dict = {
"wordpiece": self.tensorizer_with_wordpiece,
"alphanumeric": self.tensorizer_with_alphanumeric,
}
data_source = self.tsv_data_source.train if tsv else self.json_data_source.train
initialize_tensorizers(tensorizer_dict, data_source)

    def test_initialize(self):
self._init_tensorizer()
self.assertEqual(len(self.tensorizer_with_wordpiece.vocab), 1000)
self.assertEqual(
len(self.tensorizer_with_wordpiece.ques_tensorizer.vocab), 1000
)
self.assertEqual(len(self.tensorizer_with_wordpiece.doc_tensorizer.vocab), 1000)
self.assertEqual(len(self.tensorizer_with_alphanumeric.vocab), 1418)
self.assertEqual(
len(self.tensorizer_with_alphanumeric.ques_tensorizer.vocab), 1418
)
self.assertEqual(
len(self.tensorizer_with_alphanumeric.doc_tensorizer.vocab), 1418
)

    def test_numberize_with_alphanumeric(self):
self._init_tensorizer()
row = next(iter(self.json_data_source.train))
(
doc_tokens,
doc_seq_len,
ques_tokens,
ques_seq_len,
answer_start_token_idx,
answer_end_token_idx,
) = self.tensorizer_with_alphanumeric.numberize(row)
# check against manually verified answer positions in tokenized output
# there are 4 identical answers
self.assertEqual(len(ques_tokens), ques_seq_len)
self.assertEqual(len(doc_tokens), doc_seq_len)
self.assertEqual(ques_tokens, [2, 3, 4, 5, 6, 7]) # It's a coincidence.
self.assertEqual(answer_start_token_idx, [26, 26, 26, 26])
self.assertEqual(answer_end_token_idx, [26, 26, 26, 26])
self.tensorizer_with_alphanumeric.doc_tensorizer.max_seq_len = 20
# answer should be truncated out because max doc len is smaller.
(
doc_tokens,
doc_seq_len,
ques_tokens,
ques_seq_len,
answer_start_token_idx,
answer_end_token_idx,
) = self.tensorizer_with_alphanumeric.numberize(row)
self.assertEqual(len(ques_tokens), ques_seq_len)
self.assertEqual(len(doc_tokens), doc_seq_len)
self.assertEqual(answer_start_token_idx, [-100])
self.assertEqual(answer_end_token_idx, [-100])

    def test_numberize_with_wordpiece(self):
self._init_tensorizer()
row = next(iter(self.json_data_source.train))
(
doc_tokens,
doc_seq_len,
ques_tokens,
ques_seq_len,
answer_start_token_idx,
answer_end_token_idx,
) = self.tensorizer_with_wordpiece.numberize(row)
# check against manually verified answer positions in tokenized output
# there are 4 identical answers
self.assertEqual(len(ques_tokens), ques_seq_len)
self.assertEqual(len(doc_tokens), doc_seq_len)
self.assertEqual(answer_start_token_idx, [70, 70, 70, 70])
self.assertEqual(answer_end_token_idx, [74, 74, 74, 74])
self.tensorizer_with_wordpiece.doc_tensorizer.max_seq_len = 50
# answer should be truncated out because max doc len is smaller.
(
doc_tokens,
doc_seq_len,
ques_tokens,
ques_seq_len,
answer_start_token_idx,
answer_end_token_idx,
) = self.tensorizer_with_wordpiece.numberize(row)
self.assertEqual(len(ques_tokens), ques_seq_len)
self.assertEqual(len(doc_tokens), doc_seq_len)
self.assertEqual(answer_start_token_idx, [-100])
self.assertEqual(answer_end_token_idx, [-100])

    def test_tsv_numberize_with_alphanumeric(self):
# No need to repeat other tests with TSV.
# All we want to test is that TSV and JSON loading are identical.
self._init_tensorizer(tsv=True)
        row = next(iter(self.tsv_data_source.train))
(
doc_tokens,
doc_seq_len,
ques_tokens,
ques_seq_len,
answer_start_token_idx,
answer_end_token_idx,
) = self.tensorizer_with_alphanumeric.numberize(row)
# check against manually verified answer positions in tokenized output
# there are 4 identical answers
self.assertEqual(len(ques_tokens), ques_seq_len)
self.assertEqual(len(doc_tokens), doc_seq_len)
self.assertEqual(ques_tokens, [2, 3, 4, 5, 6, 7]) # It's a coincidence.
self.assertEqual(answer_start_token_idx, [26, 26, 26, 26])
self.assertEqual(answer_end_token_idx, [26, 26, 26, 26])
self.tensorizer_with_alphanumeric.doc_tensorizer.max_seq_len = 20
# answer should be truncated out because max doc len is smaller.
(
doc_tokens,
doc_seq_len,
ques_tokens,
ques_seq_len,
answer_start_token_idx,
answer_end_token_idx,
) = self.tensorizer_with_alphanumeric.numberize(row)
self.assertEqual(len(ques_tokens), ques_seq_len)
self.assertEqual(len(doc_tokens), doc_seq_len)
self.assertEqual(answer_start_token_idx, [-100])
self.assertEqual(answer_end_token_idx, [-100])