#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from enum import Enum
class Token(str):
    """A ``str`` subclass used to mark special vocabulary tokens.

    A ``Token`` behaves like a string everywhere (hashing, slicing,
    concatenation) but deliberately never compares equal to a plain
    ``str`` with the same content — only to another ``Token``.
    """

    def __eq__(self, other):
        # We don't want to compare as equal to actual strings, but we want to behave
        # like a string code-wise. Don't use `is` comparison because we want
        # Token instances created across picklings to equals-compare
        return isinstance(other, Token) and super().__eq__(other)

    def __ne__(self, other):
        # Bug fix: without this override, the inherited str.__ne__ compares raw
        # string content, so Token("x") == "x" and Token("x") != "x" were BOTH
        # False. Keep != consistent with the custom __eq__ above.
        return not self.__eq__(other)

    def __init__(self, input_str):
        # Store the raw string on an attribute as well.
        # NOTE(review): redundant with the str value itself, but kept because
        # callers elsewhere may read `.str` — confirm before removing.
        self.str = input_str
        super().__init__()

    # Defining __eq__ would otherwise set __hash__ to None; restore the
    # plain-string hash so Tokens remain usable as dict/set keys.
    __hash__ = str.__hash__
class SpecialTokens:
    """Namespace of the special :class:`Token` instances used in vocabularies."""

    UNK = Token("__UNKNOWN__")  # out-of-vocabulary token
    PAD = Token("__PAD__")  # sequence padding
    BOS = Token("__BEGIN_OF_SENTENCE__")
    EOS = Token("__END_OF_SENTENCE__")
    BOL = Token("__BEGIN_OF_LIST__")
    EOL = Token("__END_OF_LIST__")
    MASK = Token("__MASK__")
    SELFIE_RAW_IMAGE = Token("__RAW_IMAGE__")
    # BOS and EOS is too long for Byte-level Language Model.
    # TODO: find out combination of bytes with low frequency and shorter length
    BYTE_BOS = Token("^")
    BYTE_EOS = Token("#")
    BYTE_SPACE = Token(" ")
class DatasetFieldName:
    """String keys naming the fields of a processed dataset batch."""

    DOC_LABEL_FIELD = "doc_label"
    WORD_LABEL_FIELD = "word_label"
    UTTERANCE_FIELD = "utterance"
    TEXT_FIELD = "word_feat"
    SEQ_FIELD = "seq_word_feat"
    DICT_FIELD = "dict_feat"
    RAW_DICT_FIELD = "sparsefeat"
    CHAR_FIELD = "char_feat"
    DENSE_FIELD = "dense_feat"
    CONTEXTUAL_TOKEN_EMBEDDING = "contextual_token_embedding"
    DOC_WEIGHT_FIELD = "doc_weight"
    WORD_WEIGHT_FIELD = "word_weight"
    RAW_WORD_LABEL = "raw_word_label"
    TOKEN_INDICES = "token_indices"
    TOKEN_RANGE = "token_range"
    TOKENS = "tokens"
    LANGUAGE_ID_FIELD = "lang"
    SEQ_LENS = "seq_lens"
    TARGET_SEQ_LENS = "target_seq_lens"
    RAW_SEQUENCE = "raw_sequence"
    SOURCE_SEQ_FIELD = "source_sequence"
    TARGET_SEQ_FIELD = "target_sequence"
    NUM_TOKENS = "num_tokens"
class PackageFileName:
    """File names used when packaging pretrained embeddings."""

    SERIALIZED_EMBED = "pretrained_embed_pt_serialized"  # torch-serialized form
    RAW_EMBED = "pretrained_embed_raw"  # raw text form
class DFColumn:
    """Column names used for raw-input dataframe-style records."""

    DOC_LABEL = "doc_label"
    WORD_LABEL = "word_label"
    UTTERANCE = "text"
    ALIGNMENT = "alignment"
    DICT_FEAT = "dict_feat"
    DENSE_FEAT = "dense_feat"
    RAW_FEATS = "raw_feats"
    MODEL_FEATS = "model_feats"
    DOC_WEIGHT = "doc_weight"
    WORD_WEIGHT = "word_weight"
    TOKEN_RANGE = "token_range"
    LANGUAGE_ID = "lang"
    SOURCE_SEQUENCE = "source_sequence"
    CONTEXT_SEQUENCE = "context_sequence"
    TARGET_SEQUENCE = "target_sequence"
    SOURCE_FEATS = "source_feats"
    TARGET_TOKENS = "target_tokens"
    SEQLOGICAL = "seqlogical"
    TARGET_PROBS = "target_probs"
    TARGET_LOGITS = "target_logits"
    TARGET_LABELS = "target_labels"
class Padding:
    """Padding label string and index constants."""

    WORD_LABEL_PAD = "PAD_LABEL"
    WORD_LABEL_PAD_IDX = 0
    # -1 is commonly used as an ignore-index for loss computation
    DEFAULT_LABEL_PAD_IDX = -1
class BatchContext:
    """Keys for per-batch context metadata."""

    IGNORE_LOSS = "ignore_loss"
    INDEX = "row_index"
    TASK_NAME = "task_name"
class Stage(Enum):
    """The phase a model run is in; values are human-readable names."""

    TRAIN = "Training"
    EVAL = "Evaluation"
    TEST = "Test"
    OTHERS = "Others"
class RawExampleFieldName:
    """Field names attached to raw (unprocessed) examples."""

    ROW_INDEX = "row_index"