Source code for diagnnose.syntax.tasks.linzen

import os
import warnings
from typing import Dict, List, NamedTuple, Optional, Tuple

from torchtext.data import Example, Field
from unidecode import unidecode

from diagnnose.corpus import Corpus
from diagnnose.typedefs.syntax import SyntaxEvalCorpora

from ..task import SyntaxEvalTask


class RawItem(NamedTuple):
    """A single row of the Linzen et al. (2016) agreement corpus.

    The original tab-separated corpus contains exactly these 18 columns.
    All fields are kept as raw strings; numeric columns (e.g. ``subj_index``
    and ``verb_index``) are converted with ``int()`` at the point of use.
    """

    sentence: str
    orig_sentence: str
    pos_sentence: str
    subj: str
    verb: str
    subj_pos: str
    has_rel: str
    has_nsubj: str
    verb_pos: str
    subj_index: str
    verb_index: str
    n_intervening: str
    last_intervening: str
    n_diff_intervening: str
    distance: str
    max_depth: str
    all_nouns: str
    nouns_up_to_verb: str
class LinzenTask(SyntaxEvalTask):
    """Subject-verb agreement evaluation task of Linzen et al. (2016).

    Arxiv link: https://arxiv.org/abs/1611.01368

    Repo: https://github.com/TalLinzen/rnn_agreement
    """

    def initialize(
        self,
        path: str,
        subtasks: Optional[List[str]] = None,
        items_per_subtask: Optional[int] = 1000,
    ) -> SyntaxEvalCorpora:
        """Performs the initialization for the tasks of Linzen et al. (2016)

        Arxiv link: https://arxiv.org/abs/1611.01368

        Repo: https://github.com/TalLinzen/rnn_agreement

        Parameters
        ----------
        path : str
            Path to the directory that contains the Linzen corpus files
            (``agr_50_mostcommon_10K.tsv`` and ``wiki.vocab``) from the
            github repo.
        subtasks : List[str], optional
            The downstream tasks that will be tested. If not provided this
            will default to the full set of conditions.
        items_per_subtask : int, optional
            Number of items that is selected per subtask. Defaults to 1000;
            pass ``None`` to use the full subtask set instead.

        Returns
        -------
        corpora : Dict[str, Corpus]
            Dictionary mapping a subtask to a Corpus.
        """
        subtasks = subtasks or ["SS", "SP", "PS", "PP", "SPP", "PSS", "SPPP", "PSSS"]

        corpora: SyntaxEvalCorpora = self._create_corpora(
            path, subtasks, items_per_subtask
        )

        return corpora

    def _create_corpora(
        self, corpus_path: str, subtasks: List[str], items_per_subtask: Optional[int]
    ) -> SyntaxEvalCorpora:
        """Builds one Corpus per SVA condition, grouped by attractor count."""
        raw_corpora: Dict[str, List[RawItem]] = self._create_raw_corpora(
            corpus_path, subtasks
        )

        verb_inflections = self._create_verb_inflections(corpus_path)

        corpora: SyntaxEvalCorpora = {}
        for condition, items in raw_corpora.items():
            corpus = self._create_corpus(items, verb_inflections, items_per_subtask)
            # Conditions of equal length share an attractor count: "SP"/"PS"
            # have 1 attractor, "SPP"/"PSS" have 2, etc.
            n_attractors = str(len(condition))
            corpora.setdefault(n_attractors, {})[condition] = corpus

        return corpora

    def _create_raw_corpora(self, corpus_path, subtasks) -> Dict[str, List[RawItem]]:
        """Reads the raw tsv corpus and partitions its items by SVA condition."""
        raw_corpora: Dict[str, List[RawItem]] = {}

        with open(os.path.join(corpus_path, "agr_50_mostcommon_10K.tsv")) as f:
            next(f)  # skip header
            for line in f:
                item = RawItem(*line.strip().split("\t"))
                # Compute the condition once and reuse it for both the
                # membership test and the corpus key (the original computed
                # it twice per line).
                sva_condition = self._item_to_sva_condition(item)
                if sva_condition in subtasks:
                    raw_corpora.setdefault(sva_condition, []).append(item)

        return raw_corpora

    def _create_corpus(
        self,
        items: List[RawItem],
        verb_inflections: Dict[str, str],
        items_per_subtask: Optional[int],
    ) -> Corpus:
        """Converts raw items into a Corpus.

        Items whose verb has no known opposite inflection in the model
        vocabulary are dropped (see ``_item_to_example``).
        """
        header = ["sen", "token", "counter_token"]
        fields = Corpus.create_fields(header, tokenizer=self.tokenizer)

        raw_examples = [
            self._item_to_example(item, fields, verb_inflections) for item in items
        ]
        # _item_to_example returns None for out-of-vocabulary verbs.
        examples: List[Example] = [ex for ex in raw_examples if ex is not None]

        if items_per_subtask is not None:
            examples = examples[:items_per_subtask]

        return Corpus(examples, fields)

    @staticmethod
    def _item_to_sva_condition(item: RawItem) -> str:
        """Maps an item to an SVA condition, based on the POS tags of the
        sentence between the subject and the main verb.

        For example:
            The_DT men_NNS under_IN the_DT bridge_NN walk_VBP
        is mapped to "PS": a plural subject with 1 singular attractor.
        """
        pos_sen = item.pos_sentence.split()
        # subj_index and verb_index are 1-based positions in the sentence.
        pos_subsen = pos_sen[int(item.subj_index) - 1 : int(item.verb_index)]

        # Non-noun tags map to "" and thus drop out of the condition string.
        pos_mapping = {"NNS": "P", "NN": "S"}
        sva_condition = "".join(pos_mapping.get(tag, "") for tag in pos_subsen)

        return sva_condition

    @staticmethod
    def _item_to_example(
        item: RawItem, fields: List[Tuple[str, Field]], verb_inflections: Dict[str, str]
    ) -> Optional[Example]:
        """Creates an Example containing the subsentence and both forms of
        the verb.

        If a verb form is not present in the model vocab, None is returned.
        """
        orig_sentence = item.orig_sentence
        # Transliterate characters outside latin-1 so tokens match the
        # ascii-based model vocabulary.
        if not all(ord(c) < 256 for c in orig_sentence):
            orig_sentence = unidecode(orig_sentence)

        # The evaluated subsentence runs up to, but not including, the verb.
        subsen = orig_sentence.split()[: int(item.verb_index) - 1]

        opposite_verb = verb_inflections.get(item.verb, None)
        if opposite_verb is None:
            return None

        return Example.fromlist([subsen, item.verb, opposite_verb], fields)

    def _create_verb_inflections(self, corpus_path: str) -> Dict[str, str]:
        """Create sing<>plur mappings for all verbs in the model vocab.

        Mappings are based on the pos-tagged wiki.vocab file of Linzen et
        al., and the `inflect` library. We only add token mappings if both
        the singular and plural form is present in the model's vocabulary.
        """
        try:
            import inflect
        except ImportError:
            warnings.warn("`inflect` is needed, can be installed using pip")
            raise

        infl_eng = inflect.engine()

        # VBZ = 3rd-person-singular present, VBP = non-3rd-singular present.
        pos_to_token: Dict[str, List[str]] = {"VBP": [], "VBZ": []}

        try:
            with open(os.path.join(corpus_path, "wiki.vocab")) as file:
                next(file)  # skip header
                for line in file:
                    word, pos, _ = line.strip().split()
                    if word in self.tokenizer.vocab and pos in pos_to_token:
                        pos_to_token[pos].append(word)
        except FileNotFoundError:
            warnings.warn(
                "wiki.vocab is expected to be located in the same directory as the full corpus."
            )
            raise

        verb_inflections: Dict[str, str] = {}

        # A set makes the per-verb membership test O(1); the original
        # list lookup made this loop quadratic in the vocabulary size.
        plural_vocab = set(pos_to_token["VBP"])

        # Map each singular verb to its plural form and vice versa, keeping
        # only pairs whose both forms survived the vocab filter above.
        for word in pos_to_token["VBZ"]:
            candidate = infl_eng.plural_verb(word)
            if candidate in plural_vocab:
                verb_inflections[candidate] = word
                verb_inflections[word] = candidate

        return verb_inflections