Source code for diagnnose.tokenizer.w2i

from typing import Dict, Optional
from warnings import warn


class W2I(dict):
    """Provides vocab functionality mapping words to indices.

    Non-existing tokens are mapped to the id of an unk token that should
    be present in the vocab file.

    Parameters
    ----------
    w2i : Dict[str, int]
        Dictionary that maps strings to indices. This dictionary can be
        created using `create_vocab`.
    unk_token : str, optional
        The unk token to which unknown words will be mapped. Defaults to
        <unk>.
    eos_token : str, optional
        The end-of-sentence token that is used in the corpus. Defaults
        to <eos>.
    pad_token : str, optional
        The padding token. This class only stores it on the instance.
        Defaults to <pad>.
    notify_unk : bool, optional
        Notify when a requested token is not present in the vocab.
        Defaults to False.

    Attributes
    ----------
    unk_token, eos_token, pad_token : str
        The special tokens passed to the constructor.
    unk_idx : Optional[int]
        Index of ``unk_token`` in the vocab, or None if the vocab does
        not contain it (a warning is issued in that case).
    notify_unk : bool
        Whether a warning is emitted on each unknown-token lookup.
    i2w : list of str
        The vocab words ordered by their index, built once at
        construction time. ``i2w[idx]`` is the word for ``idx`` provided
        the indices are the contiguous range ``0 .. len(vocab) - 1``.
    """

    def __init__(
        self,
        w2i: Dict[str, int],
        unk_token: str = "<unk>",
        eos_token: str = "<eos>",
        pad_token: str = "<pad>",
        notify_unk: bool = False,
    ) -> None:
        super().__init__(w2i)

        # Only warn instead of raising: a vocab without an unk token is
        # still usable as long as every requested token is present.
        if unk_token not in w2i:
            warn(f"Unk token {unk_token} not found in provided vocab.")

        self.unk_token = unk_token
        self.eos_token = eos_token
        self.pad_token = pad_token

        self.unk_idx: Optional[int] = w2i.get(unk_token)

        self.notify_unk = notify_unk

        # Order the words by their index rather than by insertion order,
        # so the reverse mapping stays correct when the provided dict was
        # not populated in index order. `sorted` is stable, so a vocab
        # that is already in index order yields the same list as before.
        self.i2w = sorted(w2i, key=w2i.__getitem__)

    @property
    def w2i(self) -> Dict[str, int]:
        """Return the mapping itself, so ``obj.w2i[word]`` works too."""
        return self

    def __missing__(self, key: str) -> Optional[int]:
        """Fallback used by ``self[key]`` when ``key`` is not in the vocab.

        Returns
        -------
        Optional[int]
            The index of the unk token, or None if the unk token was not
            present in the vocab at construction time.
        """
        if self.notify_unk:
            warn(f"`{key}` is not present in vocab")
        return self.unk_idx