Skip to content

Commit

Permalink
Set up the Levin, LIWC, and General Inquirer files
Browse files Browse the repository at this point in the history
  • Loading branch information
bryant1410 committed Jun 18, 2023
1 parent 3e7b3bc commit 7144cff
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 14 deletions.
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,17 @@ above) for an explanation of the results.
We provide [the output of this command ready to
download](https://huggingface.co/datasets/MichiganNLP/scalable_vlm_probing/blob/main/words_counter_LAION.json).
6. TODO: how to obtain Levin.
7. Obtain LIWC 2015. See [LIWC website](https://www.liwc.app/) for more information.
8. TODO: how to obtain the General Inquirer.
9. Run the following to obtain the resulting correlation scores and save them as files:
6. Obtain LIWC 2015. See [LIWC website](https://www.liwc.app/) for more information. Set the path or URL of the file
`LIWC.2015.all.txt` in the environment variable `LIWC_URL_OR_PATH`:
```bash
export LIWC_URL_OR_PATH=...
```
You can also disable the LIWC features by listing `LIWC` in the `--remove-features` flag, together with any other
features you want removed — for example, to remove LIWC in addition to the features removed by default:
`--remove-features LIWC wup-similarity lch-similarity path-similarity`.
7. Run the following to obtain the resulting correlation scores and save them as files:
```bash
./main.py --dependent-variable-name pos_clip_score --no-neg-features > results/pos_scores.txt
Expand Down
29 changes: 20 additions & 9 deletions features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import ast
import itertools
import json
import os
import string
import warnings
from collections import Counter, defaultdict
Expand All @@ -12,6 +13,7 @@
import numpy as np
import pandas as pd
import statsmodels.api as sm
from cached_path import cached_path
from datasets import load_dataset
from huggingface_hub import snapshot_download
from nltk.corpus import wordnet as wn
Expand Down Expand Up @@ -40,14 +42,15 @@
VALID_NEG_TYPES = get_args(NegType)
VALID_LEVIN_RETURN_MODES = get_args(LevinReturnMode)

PATH_DATA_FOLDER = Path(snapshot_download("MichiganNLP/probing-clip", repo_type="dataset"))

PATH_LEVIN_VERBS = PATH_DATA_FOLDER / "levin_verbs.txt"
PATH_LEVIN_SEMANTIC_BROAD = PATH_DATA_FOLDER / "levin_semantic_broad.json"
PATH_LIWC = PATH_DATA_FOLDER / "LIWC.2015.all.txt"
PATH_GENERAL_INQ = PATH_DATA_FOLDER / "inquirer_augmented.xls"
PATH_DATA_FOLDER = Path(snapshot_download("MichiganNLP/scalable_vlm_probing", repo_type="dataset"))
PATH_WORD_FREQUENCIES = PATH_DATA_FOLDER / "words_counter_LAION.json"

PATH_LEVIN_VERBS = cached_path("https://huggingface.co/datasets/MichiganNLP/levin_verbs/raw/main/levin_verbs.txt")
PATH_LEVIN_SEMANTIC_BROAD = cached_path(
"https://huggingface.co/datasets/MichiganNLP/levin_verbs/raw/main/levin_semantic_broad.json")

PATH_GENERAL_INQ = cached_path("https://inquirer.sites.fas.harvard.edu/inquireraugmented.xls")

text_model = SentenceTransformer("all-MiniLM-L6-v2")

stemmer = PorterStemmer()
Expand Down Expand Up @@ -239,11 +242,19 @@ def _get_frequency(word: str, word_frequencies: Mapping[str, int]) -> int:
return word_frequencies.get(word, 0)


def _parse_liwc_file(path: FilePath = PATH_LIWC, verbose: bool = True) -> Mapping[str, Sequence[str]]:
def _parse_liwc_file(verbose: bool = True) -> Mapping[str, Sequence[str]]:
dict_liwc = defaultdict(list)
liwc_categories = set()

with open(path) as file:
url_or_path = os.environ.get("LIWC_URL_OR_PATH")

if not url_or_path:
raise ValueError("The environment variable LIWC_URL_OR_PATH must be set to use LIWC."
" To disable LIWC, use the flag `--remove-features`, also including any other feature that you"
" want to remove. To remove LIWC along with the default removed features, use:"
" `--remove-features LIWC wup-similarity lch-similarity path-similarity`.")

with open(cached_path(url_or_path)) as file:
for line in file:
word, category = (w.strip() for w in line.strip().split(","))
dict_liwc[word].append(category)
Expand Down Expand Up @@ -352,7 +363,7 @@ def _compute_features(clip_results: pd.DataFrame, feature_deny_list: Collection[
# We use the underscore to separate a feature name from its value if it's binarized.
df = df.rename(columns={"neg_type": "neg-type"})

if "number of words" not in feature_deny_list:
if "number-of-words" not in feature_deny_list:
df["number of words"] = df.sentence.str.split().str.len()

if compute_neg_features:
Expand Down
5 changes: 4 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,10 @@ def parse_args() -> argparse.Namespace:

parser.add_argument("--dependent-variable-name")
parser.add_argument("-r", "--remove-features", dest="feature_deny_list", nargs="+",
default={"wup-similarity", "lch-similarity", "path-similarity"})
default={"wup-similarity", "lch-similarity", "path-similarity"},
choices={"concreteness", "frequency", "GeneralINQ", "hypernym", "hypernym/indirect",
"lch-similarity", "Levin", "LIWC", "nb-synsets", "number-of-words", "path-similarity",
"spacy", "text-similarity", "word-similarity", "wup-similarity"})
parser.add_argument("--min-non-most-frequent-values", type=int, default=100,
help="The minimum number of values that have to be different from the most frequent one.")
parser.add_argument("--no-neg-features", dest="compute_neg_features", action="store_false")
Expand Down

0 comments on commit 7144cff

Please sign in to comment.