# Source code for diagnnose.tokenizer.w2i

from typing import Dict, Optional
from warnings import warn


class W2I(dict):
    """Provides vocab functionality mapping words to indices.

    Non-existing tokens are mapped to the id of an unk token that should
    be present in the vocab file.

    Parameters
    ----------
    w2i : Dict[str, int]
        Dictionary that maps strings to indices. This dictionary can be
        created using `create_vocab`.
    unk_token : str, optional
        The unk token to which unknown words will be mapped. Defaults to
        <unk>.
    eos_token : str, optional
        The end-of-sentence token that is used in the corpus. Defaults
        to <eos>.
    pad_token : str, optional
        The padding token; stored on the instance as ``pad_token``.
        Defaults to <pad>.
    notify_unk : bool, optional
        Notify when a requested token is not present in the vocab.
        Defaults to False.

    Attributes
    ----------
    unk_idx : Optional[int]
        Index of ``unk_token`` in the vocab, or None if the token is not
        part of the provided vocab.
    i2w : List[str]
        The vocab words ordered by ascending index, so that ``i2w[idx]``
        yields the word for ``idx`` when the indices run contiguously
        from 0.
    """

    def __init__(
        self,
        w2i: Dict[str, int],
        unk_token: str = "<unk>",
        eos_token: str = "<eos>",
        pad_token: str = "<pad>",
        notify_unk: bool = False,
    ) -> None:
        super().__init__(w2i)

        # A missing unk token is only reported, not treated as an error:
        # lookups of unknown words will then yield None (see __missing__).
        if unk_token not in w2i:
            warn(f"Unk token {unk_token} not found in provided vocab.")

        self.unk_token = unk_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_idx = w2i.get(unk_token, None)
        self.notify_unk = notify_unk

        # Order the words by their index rather than by dict insertion
        # order, so the reverse mapping stays correct even when the vocab
        # dict was not built in index order. The sort is stable, hence a
        # vocab that is already in index order yields the same list as
        # ``list(w2i.keys())``.
        self.i2w = sorted(w2i, key=w2i.get)

    @property
    def w2i(self) -> Dict[str, int]:
        """Return the mapping itself; the instance is the word-to-index dict."""
        return self

    def __missing__(self, key: str) -> Optional[int]:
        """Return the unk index for a token that is not in the vocab.

        Invoked by ``dict.__getitem__`` on a failed lookup. The key is
        not inserted into the mapping. Emits a warning first when
        ``notify_unk`` is set. The result is None when the unk token
        itself is absent from the vocab.
        """
        if self.notify_unk:
            warn(f"`{key}` is not present in vocab")

        return self.unk_idx