Source code for diagnnose.tokenizer.c2i

from typing import Any, Dict, Set

import numpy as np
from unidecode import unidecode

from .w2i import W2I


class C2I(W2I):
    """Vocabulary containing character-level information.

    Adapted from:
    https://github.com/tensorflow/models/tree/master/research/lm_1b
    """

    def __init__(
        self, w2i: Dict[str, int], max_word_length: int = 50, **kwargs: Any
    ) -> None:
        super().__init__(w2i, **kwargs)

        self._max_word_length = max_word_length

        # Collect every character that occurs in the vocabulary.
        chars_set: Set[str] = set()
        for word in w2i:
            chars_set |= set(word)

        # Characters in the 8-bit range that are unused by the vocabulary
        # are free to serve as sentinel symbols.
        free_ids = []
        for i in range(256):
            if chr(i) in chars_set:
                continue
            free_ids.append(chr(i))

        if len(free_ids) < 5:
            raise ValueError("Not enough free char ids: %d" % len(free_ids))

        self.eos_char = free_ids[1]  # <end sentence>
        self.bow_char = free_ids[2]  # <begin word>
        self.eow_char = free_ids[3]  # <end word>
        self.pad_char = free_ids[4]  # <padding>

        # Precompute the char-id arrays for every word in the vocabulary.
        self._word_char_ids = {}
        for w in self.w2i.keys():
            self._word_char_ids[w] = self._convert_word_to_char_ids(w)

    @property
    def max_word_length(self) -> int:
        return self._max_word_length

    def _convert_word_to_char_ids(self, word: str):
        code = np.zeros([self.max_word_length], dtype=np.int32)
        code[:] = ord(self.pad_char)

        # Truncate the word so that it still fits between the begin-of-word
        # and end-of-word sentinels.
        if len(word) > self.max_word_length - 2:
            word = word[: self.max_word_length - 2]
        cur_word = self.bow_char + word + self.eow_char
        for j in range(len(cur_word)):
            code[j] = ord(cur_word[j])

        return code.reshape((1, 1, -1))
    def token_to_char_ids(self, token: str):
        # Tokens containing characters outside the 8-bit range are first
        # transliterated to their closest ASCII representation.
        if not all(ord(c) < 256 for c in token):
            token = unidecode(token)

        if token in self._word_char_ids:
            char_ids = self._word_char_ids[token]
        else:
            # Out-of-vocabulary tokens are converted on the fly and cached.
            char_ids = self._convert_word_to_char_ids(token)
            self._word_char_ids[token] = char_ids

        return char_ids
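
A minimal usage sketch follows, assuming the diagnnose package is installed and that the W2I base class can be built directly from a plain Dict[str, int] without additional keyword arguments; the small vocabulary and the tokens below are purely illustrative.

from diagnnose.tokenizer.c2i import C2I

# "<unk>" is included in case the base vocabulary expects an unknown-token entry.
vocab = {"the": 0, "cat": 1, "sat": 2, "<unk>": 3}
c2i = C2I(vocab, max_word_length=50)

# Known token: its char-id array was precomputed in __init__.
char_ids = c2i.token_to_char_ids("cat")
print(char_ids.shape)  # (1, 1, 50)

# Token with characters outside the 8-bit range: transliterated by
# unidecode, converted on the fly, and cached for later lookups.
char_ids = c2i.token_to_char_ids("Łódź")
print(char_ids.shape)  # (1, 1, 50)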