Standardize spacy #347

Draft: wants to merge 15 commits into main
34 changes: 29 additions & 5 deletions initialize.py
@@ -13,15 +13,39 @@
glove = None


def initialize_models():
def initialize_models(model: str = "spacy", lang: str = "en"):
Collaborator:
Since we are making this generic, it would be great to create an enum for all the heavy models we want to load, because the list may grow in the future.

Something like:

LoadOnceModel.SPACY,
LoadOnceModel.GLOVE
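
A minimal sketch of that suggestion (hedged: the LoadOnceModel name and the enum itself are only the reviewer's proposal, not code in this PR):

from enum import Enum


class LoadOnceModel(Enum):
    """Heavy models that should be loaded only once per process."""

    SPACY = "spacy"
    GLOVE = "glove"


# initialize_models could then dispatch on the enum instead of raw strings,
# e.g. initialize_models(model=LoadOnceModel.SPACY, lang="en").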

"""
Initialize heavy models used across transformations/filters.

Parameters:
-----------
model: str, default is 'spacy'
specify the type of model: 'spacy' or 'glove'.
lang: str, default is 'en'
language code of the spaCy pipeline to load (e.g. 'en', 'es', 'zh', 'de', 'fr').

Returns:
--------
None.
"""
global spacy_nlp
global glove

# load spacy
spacy_nlp = spacy.load("en_core_web_sm")

# load glove
glove = vocab.GloVe(name = "6B", dim = "100")
if model == "spacy":
if lang == "en":
Collaborator:
Cosmetic change (lines 36-45): better to create a map of 'lang' vs spaCy model name, which would eliminate several lines of code.
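
A minimal sketch of such a map (hedged: the SPACY_MODELS name is illustrative and not part of this PR; the model names are the ones already used in the diff):

SPACY_MODELS = {
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
    "zh": "zh_core_web_sm",
    "de": "de_core_news_sm",
    "fr": "fr_core_news_sm",
}

# The per-language if/elif chain then collapses to a single lookup:
# spacy_nlp = spacy.load(SPACY_MODELS[lang])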

spacy_nlp = spacy.load("en_core_web_sm")
elif lang == "es":
spacy_nlp = spacy.load("es_core_news_sm")
elif lang == "zh":
Collaborator:
To make this more informative, can we add a log message saying which model is being loaded, since there are multiple models? Something like:
"Loading zh_core_web_sm model of spacy..."

spacy_nlp = spacy.load("zh_core_web_sm")
elif lang == "de":
spacy_nlp = spacy.load("de_core_news_sm")
elif lang == "fr":
spacy_nlp = spacy.load("fr_core_news_sm")
elif model == "glove":
# load glove
glove = vocab.GloVe(name="6B", dim="100")

Collaborator (@aadesh11, Oct 20, 2021):
We should also have an 'else' block that raises an exception with an "unsupported model" message when the argument doesn't match any known model name.
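
A minimal sketch of that fallback (hedged: the exact exception type and message are illustrative, not part of this PR):

def initialize_models(model: str = "spacy", lang: str = "en"):
    if model == "spacy":
        ...  # load the spaCy pipeline for `lang`, as in this PR
    elif model == "glove":
        ...  # load the GloVe vectors, as in this PR
    else:
        # Fail loudly on anything we don't know how to load.
        raise ValueError(f"Unsupported model '{model}'; expected 'spacy' or 'glove'.")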


def reinitialize_spacy():
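For reference, a minimal usage sketch of the revised initialize_models signature (hedged: it assumes the module stays importable as initialize, keeps setting the module-level globals as a side effect, and that the requested spaCy model is installed):

import initialize

# Load the Spanish pipeline once; the function returns None and stores the
# pipeline in the module-level global spacy_nlp instead.
initialize.initialize_models(model="spacy", lang="es")

nlp = initialize.spacy_nlp
doc = nlp("Hola mundo")
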
63 changes: 43 additions & 20 deletions transformations/add_hashtags/transformation.py
@@ -1,7 +1,8 @@
import random

from initialize import initialize_models, spacy_nlp
from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType
import random
from spacy import load


def extract_dep_nodes(dep_parse, be_class_verb):
@@ -37,15 +38,15 @@ def generate_hashtag_from_noun_chunk(chunk_list, subj_obj_list):
for chunk in chunk_list:
if chunk.lower() not in subj_obj_list:
chunk_words = [word.title() for word in chunk.split(" ")]
hash_tag_list.append("#"+"".join(chunk_words))
hash_tag_list.append("#" + "".join(chunk_words))
return hash_tag_list


def extract_noun_chunks_hashtag(dep_parse, subj_obj_list):
"""Method for extracting noun chunks from dependency parse"""
chunk_list = []
for chunk in dep_parse.noun_chunks:
if len(str(chunk.text.split(" ")))>0:
if len(str(chunk.text.split(" "))) > 0:
chunk_list.append(chunk.text)
return generate_hashtag_from_noun_chunk(chunk_list, subj_obj_list)

@@ -56,21 +57,27 @@ def extract_hashtags(sentence, nlp, be_class_verb, subj_obj_list):
verb, nsubj, dobj = extract_dep_nodes(dep_parse, be_class_verb)
hash_tag_list = []
for dep_n in [verb, nsubj, dobj]:
if(dep_n != ""):
hash_tag_list.append("#"+dep_n)
if dep_n != "":
hash_tag_list.append("#" + dep_n)
if verb != "" and dobj != "":
hash_tag_list.append("#"+verb+dobj)
noun_chunks_hashtags = extract_noun_chunks_hashtag(dep_parse, subj_obj_list)
hash_tag_list.append("#" + verb + dobj)
noun_chunks_hashtags = extract_noun_chunks_hashtag(
dep_parse, subj_obj_list
)
if noun_chunks_hashtags is not None:
for ht in noun_chunks_hashtags:
if ht not in hash_tag_list:
hash_tag_list.append(ht)
return verb, hash_tag_list


def get_hash_tags(sentence, be_class_verb, subj_obj_list, seed=0, max_outputs=1, nlp=None):
def get_hash_tags(
sentence, be_class_verb, subj_obj_list, seed=0, max_outputs=1, nlp=None
):
"""method for appending hashtags to sentence"""
verb, hashtag_list = extract_hashtags(sentence, nlp, be_class_verb, subj_obj_list)
verb, hashtag_list = extract_hashtags(
sentence, nlp, be_class_verb, subj_obj_list
)
transformation_list = []
for _ in range(max_outputs):
random.seed(0)
@@ -93,14 +100,30 @@ class HashtagGeneration(SentenceOperation):

def __init__(self, seed=0, max_outputs=1):
super().__init__(seed)
self.max_outputs=max_outputs
self.nlp = load('en_core_web_sm')
self.be_class_verb = ["is", "am", "are", "was", "were", "will", "shall"]
self.max_outputs = max_outputs
self.nlp = (
spacy_nlp if spacy_nlp else initialize_models()
) # loads en_core_web_sm by default
self.be_class_verb = [
"is",
"am",
"are",
"was",
"were",
"will",
"shall",
]
self.subj_obj_list = ["i", "you", "we", "they", "he", "she"]

def generate(self, sentence: str):
transformed_sentences = get_hash_tags(sentence, self.be_class_verb, self.subj_obj_list,
self.seed, self.max_outputs, self.nlp)
transformed_sentences = get_hash_tags(
sentence,
self.be_class_verb,
self.subj_obj_list,
self.seed,
self.max_outputs,
self.nlp,
)
return transformed_sentences


@@ -128,8 +151,8 @@ def generate(self, sentence: str):
# test_cases[i]["outputs"].append({"sentence":trans_sentence})
# json_file = {"type": convert_to_snake_case("add_hashtags"), "test_cases": test_cases}
# print(json.dumps(json_file))
# for ip in input_sent:
# #random.seed(0)
# print(ip)
# res = tf.generate(ip)
# print(res)
# for ip in input_sent:
# #random.seed(0)
# print(ip)
# res = tf.generate(ip)
# print(res)
30 changes: 15 additions & 15 deletions transformations/antonyms_substitute/transformation.py
@@ -1,15 +1,13 @@
import re

import nltk
import spacy
from initialize import spacy_nlp
from nltk.corpus import wordnet
import numpy as np
from nltk.corpus import wordnet

from initialize import initialize_models, spacy_nlp
from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType


"""
The code is adapted from @zijwang https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/synonym_substitution
"""
@@ -42,18 +40,19 @@ def untokenize(words):
def is_synonyms(word1, word2):
synonyms = []
for syn in wordnet.synsets(word1):
for l in syn.lemmas():
synonyms.append(l.name())
for lemma in syn.lemmas():
synonyms.append(lemma.name())
if word2 in synonyms:
return True
return False


def is_antonyms(word1, word2):
antonyms = []
for syn in wordnet.synsets(word1):
for l in syn.lemmas():
if l.antonyms():
antonyms.append(l.antonyms()[0].name())
for lemma in syn.lemmas():
if lemma.antonyms():
antonyms.append(lemma.antonyms()[0].name())
if word2 in antonyms:
return True
return False
@@ -83,10 +82,10 @@ def antonyms_substitute(text, spacy_pipeline, seed=22, max_outputs=1):
antonyms = []
# synonyms = []
for syn in wordnet.synsets(word, pos=wn_pos):
for l in syn.lemmas():
for lemma in syn.lemmas():
# synonyms.append(l.name())
if l.antonyms():
antonyms.append(l.antonyms()[0].name())
if lemma.antonyms():
antonyms.append(lemma.antonyms()[0].name())
antonyms = list(set(antonyms))

if len(antonyms) > 0:
@@ -101,7 +100,7 @@ def antonyms_substitute(text, spacy_pipeline, seed=22, max_outputs=1):
result = untokenize(result)

# choose even number of changes
if counter%2 != 0:
if counter % 2 != 0:
result = text

# avoid doing transformation that original words are either synonyms or antonyms
@@ -113,7 +112,6 @@ def antonyms_substitute(text, spacy_pipeline, seed=22, max_outputs=1):
result = text
break


if result not in results:
# make sure there is no dup in results
results.append(result)
@@ -135,7 +133,9 @@ class AntonymsSubstitute(SentenceOperation):

def __init__(self, seed=42, prob=0.5, max_outputs=1):
super().__init__(seed, max_outputs=max_outputs)
self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
self.nlp = (
spacy_nlp if spacy_nlp else initialize_models()
) # loads en_core_web_sm by default
self.prob = prob
nltk.download("wordnet")

12 changes: 7 additions & 5 deletions transformations/auxiliary_negation_removal/transformation.py
@@ -1,6 +1,4 @@
import spacy

from initialize import spacy_nlp
from initialize import initialize_models, spacy_nlp
from interfaces.SentenceOperation import SentenceOperation
from interfaces.SentencePairOperation import SentencePairOperation
from tasks.TaskTypes import TaskType
@@ -120,7 +118,9 @@ class SentenceAuxiliaryNegationRemoval(SentenceOperation):

def __init__(self, seed=0, max_outputs=1):
super().__init__(seed, max_outputs=max_outputs)
self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
self.nlp = (
spacy_nlp if spacy_nlp else initialize_models()
) # loads en_core_web_sm by default

def generate(self, sentence: str):

@@ -153,7 +153,9 @@ class PairAuxiliaryNegationRemoval(SentencePairOperation):

def __init__(self, seed=0, max_outputs=3, pos_label="1", neg_label="0"):
super().__init__(seed, max_outputs=max_outputs)
self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
self.nlp = (
spacy_nlp if spacy_nlp else initialize_models()
) # loads en_core_web_sm by default
self.pos_label = pos_label
self.neg_label = neg_label

@@ -1,7 +1,7 @@
import numpy as np
import spacy
from checklist.perturb import Perturb
from initialize import spacy_nlp

from initialize import initialize_models, spacy_nlp
from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType

@@ -15,7 +15,9 @@ def __init__(self, n=1, seed=0, max_outputs=2):
# TODO: Do not repeat parse computations.
super().__init__(seed, max_outputs=max_outputs)
self.n = n
self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
self.nlp = (
spacy_nlp if spacy_nlp else initialize_models()
) # loads en_core_web_sm by default

def generate(self, sentence: str):
np.random.seed(self.seed)
32 changes: 20 additions & 12 deletions transformations/city_names_transformation/transformation.py
@@ -1,12 +1,10 @@
import hashlib
import os
import itertools
import random
import spacy
import sys
from tasks.TaskTypes import TaskType

from initialize import initialize_models, spacy_nlp
from interfaces.SentenceOperation import SentenceOperation
import hashlib
from initialize import spacy_nlp
from tasks.TaskTypes import TaskType


def hash(input: str):
@@ -162,15 +160,23 @@ def transform(self, doc, seed=None):


class CityNamesTransformation(SentenceOperation):
tasks = [
TaskType.TEXT_CLASSIFICATION,
TaskType.TEXT_TAGGING
]
tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TAGGING]
languages = ["en", "es"]
heavy = True
keywords = ["lexical","model-based","tokenizer-required","highly-meaning-preserving","high-precision","low-coverage","high-generations","world-knowledge"]
keywords = [
"lexical",
"model-based",
"tokenizer-required",
"highly-meaning-preserving",
"high-precision",
"low-coverage",
"high-generations",
"world-knowledge",
]

# languages the operation can operate on.
def __init__(self, seed=0, max_outputs=1, lang="en", data_path=None):

"""
Constructor of the CityNamesTransformation object in a given language

@@ -195,7 +201,9 @@ def __init__(self, seed=0, max_outputs=1, lang="en", data_path=None):
# self.model = spacy.load("en_core_web_sm")
# else:
# self.model = spacy.load("es_core_news_sm")
self.model = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
self.model = (
spacy_nlp if spacy_nlp else initialize_models("spacy", lang)
)
if lang == "en":
if data_path is None:
self.transformer = ChangeCityNames(
7 changes: 4 additions & 3 deletions transformations/close_homophones_swap/transformation.py
@@ -1,9 +1,8 @@
import random

import spacy
from SoundsLike.SoundsLike import Search

from initialize import spacy_nlp
from initialize import initialize_models, spacy_nlp
from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType

@@ -60,7 +59,9 @@ class CloseHomophonesSwap(SentenceOperation):
def __init__(self, seed=0, max_outputs=1):
super().__init__(seed)
self.max_outputs = max_outputs
self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
self.nlp = (
spacy_nlp if spacy_nlp else initialize_models()
) # loads en_core_web_sm by default

def generate(self, sentence: str):
perturbed_texts = close_homophones_swap(