#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import re
from typing import List, Optional, Sequence, Tuple
from pytext.common.constants import VocabMeta
from pytext.config import ConfigBase
from pytext.data.featurizer import Featurizer, InputRecord, OutputRecord
class SimpleFeaturizer(Featurizer):
    """
    Simple featurizer for basic tokenization and gazetteer feature alignment.

    Currently only ``tokens``, ``token_ranges`` and ``characters`` of the
    returned ``OutputRecord`` are populated (see the TODO in ``tokenize``).
    """

    class Config(ConfigBase):
        # Optional (begin, end) marker pair; when set, the first element is
        # prepended and the second appended to every token list.
        sentence_markers: Optional[Tuple[str, str]] = None
        # Lowercase every token (applied after the PAD fallback, before the
        # sentence markers are added).
        lowercase_tokens: bool = True
        # Regular expression whose matches are treated as token separators.
        split_regex: str = r"\s+"
        # Emit one token per byte of the encoded text instead of splitting
        # on ``split_regex``.
        convert_to_bytes: bool = False

    @staticmethod
    def _split_bytes(raw_text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
        """Return one single-character token per byte of the encoded text.

        Ranges are ``(i, i + 1)`` offsets into the encoded byte string, not
        into ``raw_text``.
        """
        encoded = raw_text.encode()
        tokens = [chr(byte) for byte in encoded]
        token_ranges = [(offset, offset + 1) for offset in range(len(encoded))]
        return tokens, token_ranges

    @staticmethod
    def _split_regex(
        raw_text: str, split_regex: str
    ) -> Tuple[List[str], List[Tuple[int, int]]]:
        """Split ``raw_text`` on ``split_regex`` matches, dropping empty pieces.

        Ranges are ``(start, end)`` character offsets into ``raw_text`` with
        an exclusive ``end``.
        """
        tokens: List[str] = []
        token_ranges: List[Tuple[int, int]] = []

        def add_token(start: int, end: int) -> None:
            # Adjacent or leading/trailing separators yield empty slices;
            # those are not tokens.
            if raw_text[start:end]:
                tokens.append(raw_text[start:end])
                token_ranges.append((start, end))

        cursor = 0
        for separator in re.finditer(split_regex, raw_text):
            add_token(cursor, separator.start())
            cursor = separator.end()
        # Whatever follows the last separator (or the whole text if there
        # was no separator at all).
        add_token(cursor, len(raw_text))
        return tokens, token_ranges

    def tokenize(self, input_record: InputRecord) -> OutputRecord:
        """Tokenize one instance/example only.

        Args:
            input_record: record whose ``raw_text`` is tokenized.

        Returns:
            An ``OutputRecord`` carrying ``tokens``, ``token_ranges`` and
            ``characters`` (the per-token list of characters).
        """
        config = self.config
        if config.convert_to_bytes:
            tokens, token_ranges = self._split_bytes(input_record.raw_text)
        else:
            tokens, token_ranges = self._split_regex(
                input_record.raw_text, config.split_regex
            )

        # NOTE(review): neither the PAD fallback nor the sentence markers
        # below add an entry to token_ranges, so token_ranges can be shorter
        # than tokens. Kept as-is because consumers may rely on it; confirm
        # before aligning the two lists.
        if not tokens:
            # Add PAD_TOKEN in case of empty text
            tokens = [VocabMeta.PAD_TOKEN]
        if config.lowercase_tokens:
            # NOTE(review): in byte mode this also rewrites non-ASCII bytes
            # whose Latin-1 character has a lowercase form (e.g. 0xC3 becomes
            # 0xE3). Kept as-is since existing vocabularies may depend on it.
            tokens = [token.lower() for token in tokens]
        if config.sentence_markers:
            tokens.insert(0, config.sentence_markers[0])
            tokens.append(config.sentence_markers[1])

        characters = [list(token) for token in tokens]
        # TODO: support remaining features (see OutputRecord)
        return OutputRecord(
            tokens=tokens, token_ranges=token_ranges, characters=characters
        )

    def tokenize_batch(
        self, input_records: Sequence[InputRecord]
    ) -> Sequence[OutputRecord]:
        """Tokenize every record in ``input_records``, preserving order."""
        return [self.tokenize(in_record) for in_record in input_records]

    def featurize(self, input_record: InputRecord) -> OutputRecord:
        """Featurize one instance/example only.

        Featurization is currently just tokenization.
        """
        return self.tokenize(input_record)

    def featurize_batch(
        self, input_records: Sequence[InputRecord]
    ) -> Sequence[OutputRecord]:
        """Featurize every record in ``input_records``, preserving order."""
        return [self.featurize(in_record) for in_record in input_records]

    def get_sentence_markers(self, locale=None):
        """Return the configured sentence markers (``locale`` is ignored)."""
        return self.config.sentence_markers