# Source code for diagnnose.syntax.tasks.warstadt_preproc

import re
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

# Type aliases describing the nested corpus layout:
#   sen_id -> (licensor, scope, npi_present) -> item
# ItemCondition is the (licensor, scope, npi_present) triplet that
# identifies one variant of a sentence.
ItemCondition = Tuple[int, int, int]
# All variants of a single sentence, keyed by their condition triplet.
CorpusItem = Dict[ItemCondition, Dict[str, Any]]
# The full corpus, keyed by sentence id.
CorpusDict = Dict[int, CorpusItem]

# Names of the licensing environments. `create_downstream_corpus` uses
# this list as the default environment filter when no `envs` argument
# is passed.
ENVS = [
    "adverbs",
    "conditional",
    "determiner_negation_biclausal",
    "only",
    "quantifier",
    "questions",
    "sentential_negation_biclausal",
    "simplequestions",
    "superlative",
]


def preproc_warstadt(path: str) -> CorpusDict:
    """Reads and preprocesses the NPI corpus of Warstadt et al. (2019).

    Paper: https://arxiv.org/pdf/1901.03438.pdf

    Data: https://alexwarstadt.files.wordpress.com/2019/08/npi_lincensing_data.zip

    Every non-blank line of the corpus is read as tab-separated fields:
    the first field is a ``-``-separated list of ``key=value`` metadata
    pairs, the second field a 0/1 flag, and the last field the sentence.
    Blank lines are skipped.

    Parameters
    ----------
    path : str
        Path to .tsv corpus file.

    Returns
    -------
    sen_id2items : CorpusDict
        Dictionary mapping a sen_id to a dictionary that maps each
        triplet (licensor, scope, npi_present) to the full corpus item.
        Next to its metadata fields each item holds the keys
        ``correct`` (bool), ``sen`` (list of tokens) and ``sen_id``.

    Raises
    ------
    ValueError
        If a metadata field contains no ``=`` or the second column is
        not an integer.
    KeyError
        If the metadata of a line lacks ``env``, ``licensor``,
        ``scope`` or ``npi_present``.
    """
    with open(path, encoding="utf-8") as f:
        # Strip only the line terminator: the final line of a file may
        # lack one, and slicing off the last character unconditionally
        # would then eat the sentence-final punctuation.
        lines = [line.rstrip("\n").split("\t") for line in f if line.strip()]

    def preproc(s):
        # isdecimal() only accepts strings that int() is able to parse,
        # unlike isnumeric() which also accepts e.g. fractions.
        return int(s) if s.isdecimal() else s

    # Separate punctuation from adjacent tokens
    pattern = re.compile(r"[.,?]")

    def preproc_sen(s):
        return pattern.sub(lambda m: f" {m.group(0)}", s).split()

    # Map each line to a dictionary. Splitting on the first "=" only
    # keeps metadata values intact that contain a "=" themselves.
    raw_items: List[Dict[str, Any]] = [
        {
            **{
                k: preproc(v)
                for k, v in (x.split("=", 1) for x in line[0].split("-"))
            },
            "correct": bool(int(line[1])),
            "sen": preproc_sen(line[-1]),
        }
        for line in lines
    ]

    extra_idx = 0

    for idx, item in enumerate(raw_items):
        # Remove double quotes that wrap the full sentence. The checks
        # are guarded so that an empty sentence does not raise.
        tokens = item["sen"]
        if tokens and tokens[0].startswith('"'):
            tokens[0] = tokens[0][1:]
        if tokens and tokens[-1].endswith('"'):
            tokens[-1] = tokens[-1][:-1]

        sen_id = idx // 8
        # There exist only 4 instead of 8 conditions for `simplequestions`,
        # so a new sentence starts halfway each block of 8 lines.
        if item["env"] == "simplequestions" and idx % 8 == 4:
            extra_idx += 1
        item["sen_id"] = sen_id + extra_idx

        # Cut multi-word NPIs into pieces ("atall" -> "at all").
        if item["npi_present"] == 1 and item["npi"] in ("atall", "inyears"):
            item["npi"] = f"{item['npi'][:2]} {item['npi'][2:]}"

    sen_id2items = defaultdict(dict)

    for item in raw_items:
        condition = (item["licensor"], item["scope"], item["npi_present"])
        sen_id2items[item["sen_id"]][condition] = item

    return sen_id2items


def create_downstream_corpus(
    orig_corpus: Union[str, CorpusDict],
    output_path: Optional[str] = None,
    conditions: Optional[List[Tuple[int, int, int]]] = None,
    envs: Optional[List[str]] = None,
    skip_duplicate_items: bool = False,
) -> List[str]:
    """Create a new corpus from the original one that contains the
    subsentences up to the position of the NPI.

    Each row pairs the subsentence of an item with the subsentence of
    its counterpart: the item of the same sentence whose licensor flag
    is flipped.

    Parameters
    ----------
    orig_corpus : str | CorpusDict
        Either the path to the original corpus, or a CorpusDict that
        has been created using `preproc_warstadt`.
    output_path : str, optional
        Path to the output file that will be created in .tsv format.
        If not provided the corpus won't be written to disk.
    conditions : List[Tuple[int, int, int]], optional
        List of corpus item conditions (licensor, scope, npi_present).
        If not provided the correct NPI cases (1, 1, 1) will be used.
    envs : List[str], optional
        List of licensing environments that should be used. If not
        provided all environments in `ENVS` are used.
    skip_duplicate_items : bool
        Some corpus items only differ in their post-NPI content, and
        will lead to equivalent results on a downstream task. Defaults
        to False.

    Returns
    -------
    corpus : List[str]
        List of strings representing each corpus item. Note that the
        first line of the list contains the .tsv header.

    Raises
    ------
    KeyError
        If a requested condition, or its flipped-licensor counterpart,
        does not exist for a sentence of a selected environment.
    ValueError
        If the NPI of an item does not occur as a token sequence in its
        sentence or in the counterpart sentence.
    """
    if envs is None:
        envs = ENVS
    if conditions is None:
        conditions = [(1, 1, 1)]

    if isinstance(orig_corpus, str):
        id2items = preproc_warstadt(orig_corpus)
    else:
        id2items = orig_corpus

    sens_seen = set()
    corpus = [
        "\t".join(
            [
                "orig_sen_idx",
                "condition",
                "sen",
                "counter_sen",
                "token",
                "full_npi",
                "env",
                "labels",
            ]
        )
    ]

    for idx, items in id2items.items():
        # The env is read from the (1, 1, 1) item when it exists. Otherwise
        # any item of the sentence is used.
        # NOTE(review): presumably all conditions of one sentence share
        # their env -- confirm against the corpus.
        if (1, 1, 1) in items:
            env_item = items[1, 1, 1]
        else:
            env_item = next(iter(items.values()), None)
        if env_item is None or env_item["env"] not in envs:
            continue

        for condition in conditions:
            licensor, scope, npi_present = condition
            item = items[condition]

            # Flip licensor bool to create an item of the opposite licensing polarity.
            counter_condition = (int(not licensor), scope, npi_present)
            counter_item = items[counter_condition]

            full_npi = item["npi"]
            # For multi-phrase NPIs (at all, in years) we are interested in the final token
            true_npi = full_npi.split()[-1]
            sen = " ".join(item["sen"])
            counter_sen = " ".join(counter_item["sen"])

            npi_idx = _final_npi_token_index(sen, full_npi)
            counter_npi_idx = _final_npi_token_index(counter_sen, full_npi)

            monotonicity = "downward" if licensor == 1 else "upward"

            # Items sharing the same pre-NPI content and NPI are duplicates.
            prefix_key = sen[:npi_idx] + full_npi
            if skip_duplicate_items and prefix_key in sens_seen:
                continue

            sens_seen.add(prefix_key)

            corpus.append(
                "\t".join(
                    (
                        str(idx),
                        str(condition),
                        sen[:npi_idx],
                        counter_sen[:counter_npi_idx],
                        true_npi,
                        full_npi,
                        item["env"],
                        monotonicity,
                    )
                )
            )

    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(corpus))

    return corpus


def _final_npi_token_index(sen: str, full_npi: str) -> int:
    """Return the index in `sen` at which the final token of `full_npi` starts.

    `sen` is a sentence of space-joined tokens and `full_npi` a
    (possibly multi-word) NPI. Raises a ValueError if `full_npi` does
    not occur in `sen` as a sequence of whole tokens.
    """
    # Padding the sentence lets the NPI match on token boundaries even
    # when it is the very first or very last token. The leading pad
    # shifts every index by one, so the index of the space preceding
    # the NPI in the padded string equals the NPI start in `sen`.
    start_idx = f" {sen} ".index(f" {full_npi} ")
    final_token = full_npi.split()[-1]

    return start_idx + len(full_npi) - len(final_token)


if __name__ == "__main__":
    # Script entry point: build the corpus for the licensed (1, 1, 1)
    # and unlicensed (0, 1, 1) conditions, skip duplicate items, write
    # the result to disk and report the number of lines (incl. header).
    source_path = (
        "../../../lm_data/corpora/downstream/warstadt/"
        "npi_data_all_environments.tsv"
    )
    target_path = "../../../lm_data/corpora/npi/lc_detection_binary_NEW.tsv"
    selected_conditions = [(1, 1, 1), (0, 1, 1)]

    new_corpus = create_downstream_corpus(
        source_path,
        output_path=target_path,
        conditions=selected_conditions,
        skip_duplicate_items=True,
    )
    print(len(new_corpus))