Source code for diagnnose.corpus.corpus

from typing import List, Optional, Tuple, Union

from torchtext.data import Dataset, Example, Field, Pipeline, RawField
from transformers import PreTrainedTokenizer


class Corpus(Dataset):
    def __init__(
        self,
        examples: List[Example],
        fields: List[Tuple[str, Field]],
        create_pos_tags: bool = False,
        sen_column: str = "sen",
        labels_column: Optional[str] = None,
    ) -> None:
        super().__init__(examples, fields)

        self.sen_column = sen_column
        self.labels_column = labels_column

        # If the sentence field already carries a vocab (attached by
        # `attach_tokenizer`), expose it as the corpus tokenizer.
        self.tokenizer: Optional[PreTrainedTokenizer] = None
        if hasattr(self.fields[sen_column], "vocab"):
            self.tokenizer = self.fields[sen_column].vocab

        self._attach_sen_ids()

        if create_pos_tags:
            self._create_pos_tags()

    @classmethod
    def create(
        cls,
        path: str,
        header: Optional[List[str]] = None,
        header_from_first_line: bool = False,
        to_lower: bool = False,
        sen_column: str = "sen",
        labels_column: Optional[str] = None,
        sep: str = "\t",
        tokenize_columns: Optional[List[str]] = None,
        convert_numerical: bool = False,
        create_pos_tags: bool = False,
        tokenizer: Optional[PreTrainedTokenizer] = None,
    ) -> "Corpus":
        raw_corpus = cls.create_raw_corpus(
            path, header_from_first_line=header_from_first_line, sep=sep
        )

        header = cls.create_header(
            header=header,
            header_from_first_line=header_from_first_line,
            corpus_path=path,
            sen_column=sen_column,
            sep=sep,
        )

        fields = cls.create_fields(
            header,
            to_lower=to_lower,
            sen_column=sen_column,
            tokenize_columns=tokenize_columns,
            convert_numerical=convert_numerical,
            tokenizer=tokenizer,
        )

        examples = cls.create_examples(raw_corpus, fields)

        return cls(
            examples,
            fields,
            create_pos_tags=create_pos_tags,
            sen_column=sen_column,
            labels_column=labels_column,
        )

    @staticmethod
    def create_raw_corpus(
        path: str, header_from_first_line: bool = False, sep: str = "\t"
    ) -> List[List[str]]:
        with open(path, encoding="utf8") as f:
            if header_from_first_line:
                next(f)
            raw_corpus = [line.strip().split(sep) for line in f]

        return raw_corpus

    @staticmethod
    def create_header(
        header: Optional[List[str]] = None,
        header_from_first_line: bool = False,
        corpus_path: Optional[str] = None,
        sen_column: str = "sen",
        sep: str = "\t",
    ) -> List[str]:
        if header is None:
            if header_from_first_line:
                with open(corpus_path, encoding="utf8") as f:
                    header = next(f).strip("\n").split(sep)
            else:
                header = [sen_column]

        assert sen_column in header, f"{sen_column} should be part of corpus_header!"

        return header

    @staticmethod
    def create_fields(
        header: List[str],
        to_lower: bool = False,
        sen_column: str = "sen",
        tokenize_columns: Optional[List[str]] = None,
        convert_numerical: bool = False,
        tokenizer: Optional[PreTrainedTokenizer] = None,
    ) -> List[Tuple[str, Field]]:
        tokenize_columns = tokenize_columns or [sen_column]

        pipeline = None
        if convert_numerical:

            def preprocess_field(s: Union[str, int]) -> Union[str, int]:
                # Cast purely numeric strings (e.g. label columns) to int.
                return int(s) if (isinstance(s, str) and s.isdigit()) else s

            pipeline = Pipeline(convert_token=preprocess_field)

        fields = []

        for column in header:
            if column in tokenize_columns:
                field = Field(batch_first=True, include_lengths=True, lower=to_lower)
                if tokenizer is not None:
                    attach_tokenizer(field, tokenizer)
            else:
                field = RawField(preprocessing=pipeline)
                field.is_target = False

            fields.append((column, field))

        return fields

    @staticmethod
    def create_examples(
        raw_corpus: List[List[str]], fields: List[Tuple[str, Field]]
    ) -> List[Example]:
        examples = [Example.fromlist(line, fields) for line in raw_corpus]

        return examples

    def slice(self, sen_ids: List[int]) -> "Corpus":
        """Returns a new Corpus containing only the examples in sen_ids.

        Parameters
        ----------
        sen_ids : List[int]
            List of sentence indices based on which the examples in the
            current Corpus will be filtered. These indices refer to the
            sen_idx in the original corpus; the newly sliced corpus
            retains the original sen_idx of each Example item.

        Returns
        -------
        subcorpus : Corpus
            A new Corpus instance containing the filtered list of Examples.
        """
        examples = [ex for ex in self.examples if ex.sen_idx in sen_ids]

        subcorpus = Corpus(
            examples,
            self.fields,
            sen_column=self.sen_column,
            labels_column=self.labels_column,
        )

        return subcorpus

    def _attach_sen_ids(self) -> None:
        """Adds a sentence index field to the Corpus."""
        self.fields["sen_idx"] = RawField()
        self.fields["sen_idx"].is_target = False

        for sen_idx, item in enumerate(self.examples):
            setattr(item, "sen_idx", sen_idx)

    def _create_pos_tags(self) -> None:
        """Attaches NLTK POS tags to each sentence in the Corpus."""
        import nltk

        nltk.download("averaged_perceptron_tagger")

        self.fields["pos_tags"] = RawField()
        self.fields["pos_tags"].is_target = False

        print("Tagging corpus...")
        for item in self.examples:
            sen = getattr(item, self.sen_column)
            setattr(item, "pos_tags", [t[1] for t in nltk.pos_tag(sen)])
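
A minimal usage sketch of the class above (not part of the module). The file name "corpus.tsv", its column layout, and the "labels" column are hypothetical; the file is assumed to be tab-separated with a first line that names the columns, one of which is "sen".

from diagnnose.corpus.corpus import Corpus

corpus = Corpus.create(
    "corpus.tsv",                  # hypothetical tab-separated corpus file
    header_from_first_line=True,   # first line holds the column names
    labels_column="labels",        # hypothetical label column
)

# _attach_sen_ids() has given every Example a sen_idx attribute, so a
# subcorpus that retains those original indices can be sliced off:
subcorpus = corpus.slice([0, 2, 5])
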
def attach_tokenizer(field: Field, tokenizer: PreTrainedTokenizer) -> None:
    """Creates a tokenizer that is attached to a Corpus Field.

    Parameters
    ----------
    field : Field
        Field to which the vocabulary will be attached.
    tokenizer : PreTrainedTokenizer
        Tokenizer that will convert tokens to their index.
    """

    def preprocess(text: Union[str, List[str]]) -> List[str]:
        """We only perform the splitting as a preprocessing step.

        This allows us to still have access to the original tokens,
        including those that will be mapped to <unk> later.

        We cast the encoded text back to tokens for debugging purposes,
        making it easier to inspect an example at a later stage.
        """
        if isinstance(text, list):
            text = " ".join(text)

        return tokenizer.convert_ids_to_tokens(
            tokenizer.encode(text, add_special_tokens=True)
        )

    field.preprocessing = preprocess
    field.pad_token = tokenizer.pad_token
    field.vocab = tokenizer
    field.vocab.stoi = tokenizer.vocab
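
And a sketch of attach_tokenizer on its own, assuming the legacy torchtext API the module imports and a Hugging Face tokenizer. The checkpoint "bert-base-uncased" is only an illustrative choice; any PreTrainedTokenizer that exposes a .vocab dict should work.

from torchtext.data import Field
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sen_field = Field(batch_first=True, include_lengths=True)
attach_tokenizer(sen_field, tokenizer)

# The field now splits raw text with BERT's subword vocabulary; since
# preprocess() encodes with add_special_tokens=True, the special tokens
# show up in the output:
print(sen_field.preprocessing("The quick brown fox"))
# ['[CLS]', 'the', 'quick', 'brown', 'fox', '[SEP]']
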