Source code for pytext.torchscript.tokenizer.tokenizer

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import List, Tuple

import torch

from .bpe import ScriptBPE


class ScriptTokenizerBase(torch.jit.ScriptModule):
    @torch.jit.script_method
    def tokenize(self, input: str) -> List[Tuple[str, int, int]]:
        """
        Process a single line of raw input into tokens. It supports
        two input formats:
            1) a single text
            2) a token

        Returns a list of tokens with start and end indices in the
        original input.
        """
        raise NotImplementedError
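

# An illustrative subclass (hypothetical, not part of pytext) sketching what an
# implementation of the interface looks like: `tokenize` returns one
# (token, start, end) triple per token, here one per non-space character.
class ScriptCharTokenizer(ScriptTokenizerBase):
    @torch.jit.script_method
    def tokenize(self, input: str) -> List[Tuple[str, int, int]]:
        tokens = torch.jit.annotate(List[Tuple[str, int, int]], [])
        for i in range(len(input)):
            char = input[i]
            if char != " ":
                tokens.append((char, i, i + 1))
        return tokens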


class ScriptDoNothingTokenizer(ScriptTokenizerBase):
    @torch.jit.script_method
    def tokenize(self, raw_token: str) -> List[Tuple[str, int, int]]:
        return [(raw_token, -1, -1)]
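

# A usage sketch (illustrative, not part of the original module): the scripted
# method can be called eagerly from Python, and the -1 offsets signal that no
# span information is available for pass-through tokens.
def _do_nothing_example() -> None:
    tokenizer = ScriptDoNothingTokenizer()
    print(tokenizer.tokenize("hello"))  # [('hello', -1, -1)]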


class ScriptBPETokenizer(ScriptTokenizerBase):
    def __init__(self, bpe: ScriptBPE):
        super().__init__()
        self.bpe = bpe

    @torch.jit.script_method
    def tokenize(self, raw_token: str) -> List[Tuple[str, int, int]]:
        tokens = torch.jit.annotate(List[Tuple[str, int, int]], [])
        for bpe_token in self.bpe.bpe_token(raw_token):
            tokens.append((bpe_token, -1, -1))
        return tokens
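

# A usage sketch (illustrative, not part of the original module). Constructing
# a ScriptBPE depends on the loading helpers in the sibling `bpe` module; the
# vocab path and the `from_vocab_file` call below are assumptions, so check
# that module for the actual API. Offsets are -1 because BPE pieces are not
# mapped back to input spans.
def _bpe_example(vocab_path: str) -> None:
    with open(vocab_path) as vocab_file:
        bpe = ScriptBPE.from_vocab_file(vocab_file)  # assumed loader API
    tokenizer = ScriptBPETokenizer(bpe)
    print(tokenizer.tokenize("tokenization"))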


class ScriptWordTokenizer(ScriptTokenizerBase):
    def __init__(self, lowercase: bool = True):
        super().__init__()
        self.lowercase = lowercase

    @torch.jit.script_method
    def tokenize(self, raw_token: str) -> List[Tuple[str, int, int]]:
        """
        Split raw_token into its constituent words on whitespace,
        handling runs of multiple spaces between words.

        Note: TorchScript doesn't support try/except, and since
        re.finditer uses try in its implementation, regex-based
        tokenization is not supported.
        """
        tokenize_input = raw_token.lower() if self.lowercase else raw_token
        tokens = tokenize_input.split()
        torchify_tokens = torch.jit.annotate(List[Tuple[str, int, int]], [])
        end = 0
        for token in tokens:
            # Locate each token in the input, resuming the search from the
            # previous token's end, to recover character offsets.
            start = tokenize_input.find(token, end)
            end = start + len(token)
            torchify_tokens.append((token, start, end))
        return torchify_tokens
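

# A usage sketch (illustrative, not part of the original module): start/end
# are character offsets into the (lowercased) input, and runs of spaces are
# skipped rather than producing empty tokens.
def _word_tokenizer_example() -> None:
    tokenizer = ScriptWordTokenizer(lowercase=True)
    print(tokenizer.tokenize("Hello  World"))  # [('hello', 0, 5), ('world', 7, 12)]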