Source code for pytext.data.tokenizers.tokenizer

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import re
from typing import List, NamedTuple

from fairseq.data.encoders.gpt2_bpe import get_encoder as create_gpt2_bpe
from fairseq.data.encoders.gpt2_bpe_utils import Encoder as GPT2BPEEncoder
from pytext.config import ConfigBase
from pytext.config.component import Component, ComponentType, create_component
from pytext.torchscript.tokenizer import ScriptDoNothingTokenizer
from pytext.utils.file_io import PathManager
from pytorch_pretrained_bert.tokenization import (
    BasicTokenizer,
    WordpieceTokenizer,
    load_vocab,
)
from sentencepiece import SentencePieceProcessor


[docs]class Token(NamedTuple): value: str start: int end: int
[docs]class Tokenizer(Component): """A simple regex-splitting tokenizer.""" __COMPONENT_TYPE__ = ComponentType.TOKENIZER __EXPANSIBLE__ = True
[docs] class Config(Component.Config): #: A regular expression for the tokenizer to split on. Tokens are the segments #: between the regular expression matches. The start index is inclusive of the #: unmatched region, and the end index is exclusive (matching the first #: character of the matched split region). split_regex: str = r"\s+" #: Whether token values should be lowercased or not. lowercase: bool = True
[docs] @classmethod def from_config(cls, config: Config): return cls(config.split_regex, config.lowercase)
def __init__(self, split_regex=r"\s+", lowercase=True): super().__init__(None) self.split_regex = split_regex self.lowercase = lowercase
[docs] def tokenize(self, input: str) -> List[Token]: tokens = [] start = 0 tokenize_input = input.lower() if self.lowercase else input for match in re.finditer(self.split_regex, tokenize_input): split_start, split_end = match.span() tokens.append(Token(tokenize_input[start:split_start], start, split_start)) start = split_end tokens.append(Token(tokenize_input[start : len(input)], start, len(input))) return [token for token in tokens if token.value]
[docs] def torchscriptify(self): raise NotImplementedError
[docs]class DoNothingTokenizer(Tokenizer): """ Tokenizer that takes a list of strings and converts to a list of Tokens. Useful in cases where tokenizer is run before-hand """
[docs] class Config(Component.Config): do_nothing: str = ""
[docs] @classmethod def from_config(cls, config: Config): return cls()
def __init__(self): super().__init__(None)
[docs] def tokenize(self, input: List[str]) -> List[Token]: tokens = [Token(token_text, -1, -1) for token_text in input if token_text] return tokens
[docs] def torchscriptify(self): return ScriptDoNothingTokenizer()
[docs]class BERTInitialTokenizer(Tokenizer): """ Basic initial tokenization for BERT. This is run prior to word piece, does white space tokenization in addition to lower-casing and accent removal if specified. """
[docs] class Config(Tokenizer.Config): """Config for this class."""
[docs] @classmethod def from_config(cls, config: Config): basic_tokenizer = BasicTokenizer(do_lower_case=config.lowercase) return cls(basic_tokenizer)
def __init__(self, basic_tokenizer) -> None: self.tokenizer = basic_tokenizer
[docs] def tokenize(self, text): """Tokenizes a piece of text.""" if self.tokenizer.do_lower_case: text = self.tokenizer._run_strip_accents(text.lower()) tokens = self.tokenizer.tokenize(text) end = 0 result = [] for token in tokens: start = text.find(token, end) if start == -1: # safety check, this should not happen start = end end = start + len(token) result.append(Token(token, start, end)) return result
[docs]class WordPieceTokenizer(Tokenizer): """Word piece tokenizer for BERT models."""
[docs] class Config(ConfigBase): basic_tokenizer: BERTInitialTokenizer.Config = BERTInitialTokenizer.Config() wordpiece_vocab_path: str = "/mnt/vol/nlp_technologies/bert/uncased_L-12_H-768_A-12/vocab.txt"
def __init__(self, wordpiece_vocab, basic_tokenizer, wordpiece_tokenizer) -> None: self.vocab = wordpiece_vocab self.basic_tokenizer = basic_tokenizer self.wordpiece_tokenizer = wordpiece_tokenizer
[docs] @classmethod def from_config(cls, config: Config): basic_tokenizer = create_component( ComponentType.TOKENIZER, config.basic_tokenizer ) vocab = load_vocab(config.wordpiece_vocab_path) wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab) return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
[docs] def tokenize(self, input_str: str) -> List[Token]: tokens = [] for token in self.basic_tokenizer.tokenize(input_str): start = token.start for sub_token in self.wordpiece_tokenizer.tokenize(token.value): piece_len = ( len(sub_token) if not sub_token.startswith("##") else (len(sub_token) - 2) # account for ## ) if sub_token == "[UNK]": # this fixes the bug wherein piece_len = 5 for all [UNK] piece_len = len(token.value) end = start + piece_len tokens.append(Token(sub_token, start, end)) start = end return [token for token in tokens if token.value]
[docs]class PickleableGPT2BPEEncoder(GPT2BPEEncoder): """Fairseq's encoder stores the regex module as a local reference on its encoders, which means they can't be saved via pickle.dumps or torch.save. This modified their save/load logic doesn't store the module, and restores the reference after re-inflating.""" def __getstate__(self): # make a shallow copy of state to avoid side effect on the original object state = copy.copy(vars(self)) state.pop("re") return state def __setstate__(self, state): vars(self).update(state) import regex self.re = regex
[docs]class GPT2BPETokenizer(Tokenizer): """Tokenizer for gpt-2 and RoBERTa."""
[docs] class Config(ConfigBase): bpe_encoder_path: str = ( "manifold://pytext_training/tree/static/vocabs/bpe/gpt2/encoder.json" ) bpe_vocab_path: str = ( "manifold://pytext_training/tree/static/vocabs/bpe/gpt2/vocab.bpe" )
[docs] @classmethod def from_config(cls, config: Config): # TODO: T57433776 remove once FairSeq support PathManager config.bpe_encoder_path = PathManager.get_local_path(config.bpe_encoder_path) config.bpe_vocab_path = PathManager.get_local_path(config.bpe_vocab_path) bpe = create_gpt2_bpe(config.bpe_encoder_path, config.bpe_vocab_path) # This hacks the bpe instance to be picklable bpe = copy.copy(bpe) bpe.__class__ = PickleableGPT2BPEEncoder return cls(bpe)
def __init__(self, bpe: GPT2BPEEncoder): self.bpe = bpe
[docs] def tokenize(self, input_str: str) -> List[Token]: bpe_ids = self.bpe.encode(input_str) char_tokens = [self.bpe.decoder[id].lstrip(u"\u0120") for id in bpe_ids] # fix for incorrect decoding of utf-8 chars for i, char_token in enumerate(char_tokens): try: char_tokens[i] = bytearray( [self.bpe.byte_decoder[char] for char in char_token] ).decode("utf-8") # handles BPE breaking a single multi-byte char into pieces except UnicodeDecodeError: continue lengths = [len(token) for token in char_tokens] tokens = [] end = 0 for length, id, char_token in zip(lengths, bpe_ids, char_tokens): start = input_str.find(char_token, end) end = start + length tokens.append(Token(str(id), start, end)) # handles bad start/end indices cascading to subsequent tokens. if len(tokens) > 1 and end < tokens[-2].end: end = tokens[-2].end return [token for token in tokens if token.value]
[docs]class CppProcessorMixin: """Cpp processors like SentencePiece don't pickle well; reload them.""" def _load_processor(self): raise NotImplementedError def __getstate__(self): state = dict(vars(self)) state.pop("processor") return state def __setstate__(self, state): vars(self).update(state) self._load_processor()
[docs]class SentencePieceTokenizer(Tokenizer, CppProcessorMixin): """Sentence piece tokenizer."""
[docs] class Config(ConfigBase): sp_model_path: str = ""
def __init__(self, sp_model_path: str = ""): self.sp_model_path = sp_model_path self._load_processor()
[docs] @classmethod def from_config(cls, config: Config): return cls(config.sp_model_path)
[docs] def tokenize(self, input_str: str) -> List[Token]: pieces = self.processor.EncodeAsPieces(input_str) return [Token(piece, -1, -1) for piece in pieces]
def _load_processor(self): self.processor = SentencePieceProcessor() self.processor.Load(self.sp_model_path)