From 68cc640c36170266cce28a5b537cdef721aa079a Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 14:45:14 +0000 Subject: [PATCH 01/14] Modified initialize_models() method --- initialize.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/initialize.py b/initialize.py index 9b9089545..e9dba4d7b 100644 --- a/initialize.py +++ b/initialize.py @@ -13,15 +13,39 @@ glove = None -def initialize_models(): +def initialize_models(model: str = "spacy", lang: str = "en"): + """ + Initialize heavy models used across transformations/filters + + Parameters: + ---------- + model: str, default is 'spacy' + specify the type of model 'spacy' or 'glove'. + lang: str, default is 'en' + language. + + Returns: + -------- + None. + """ global spacy_nlp global glove # load spacy - spacy_nlp = spacy.load("en_core_web_sm") - - # load glove - glove = vocab.GloVe(name = "6B", dim = "100") + if model == "spacy": + if lang == "en": + spacy_nlp = spacy.load("en_core_web_sm") + elif lang == "es": + spacy_nlp = spacy.load("es_core_news_sm") + elif lang == "zh": + spacy_nlp = spacy.load("zh_core_web_sm") + elif lang == "de": + spacy_nlp = spacy.load("de_core_news_sm") + elif lang == "fr": + spacy_nlp = spacy.load("fr_core_news_sm") + elif model == "glove": + # load glove + glove = vocab.GloVe(name="6B", dim="100") def reinitialize_spacy(): From 722a2da0f41fbf5ba92a054a350c2f1e6d90930f Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 14:49:38 +0000 Subject: [PATCH 02/14] Updated spacy loading - grapheme_to_phoneme_transformation --- .../transformation.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/transformations/grapheme_to_phoneme_transformation/transformation.py b/transformations/grapheme_to_phoneme_transformation/transformation.py index b35a8b722..3470281dd 100644 --- a/transformations/grapheme_to_phoneme_transformation/transformation.py +++ 
b/transformations/grapheme_to_phoneme_transformation/transformation.py @@ -1,11 +1,9 @@ import random -import re import string import pronouncing -import spacy -from initialize import spacy_nlp +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType @@ -65,13 +63,18 @@ class PhonemeSubstitution(SentenceOperation): TaskType.TEXT_TO_TEXT_GENERATION, ] languages = ["en"] - keywords = ["lexical", "noise", "noise", "api-based", "aural", "high-precision"] + keywords = [ + "lexical", + "noise", + "noise", + "api-based", + "aural", + "high-precision", + ] def __init__(self, seed=0, prob=0.5, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) - self.spacy_pipeline = ( - spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") - ) + self.spacy_pipeline = spacy_nlp if spacy_nlp else initialize_models() self.prob = prob def generate(self, sentence: str): From de45cca352e7b404508e01892a36e43df388e12d Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 15:07:58 +0000 Subject: [PATCH 03/14] Modified spacy loading - city_names_transformation --- .../transformation.py | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/transformations/city_names_transformation/transformation.py b/transformations/city_names_transformation/transformation.py index 5ec043c60..2e3e2dd47 100644 --- a/transformations/city_names_transformation/transformation.py +++ b/transformations/city_names_transformation/transformation.py @@ -1,12 +1,10 @@ +import hashlib import os -import itertools import random -import spacy -import sys -from tasks.TaskTypes import TaskType + +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation -import hashlib -from initialize import spacy_nlp +from tasks.TaskTypes import TaskType def hash(input: str): @@ -162,15 +160,23 @@ def transform(self, doc, seed=None): class 
CityNamesTransformation(SentenceOperation): - tasks = [ - TaskType.TEXT_CLASSIFICATION, - TaskType.TEXT_TAGGING - ] + tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TAGGING] languages = ["en", "es"] heavy = True - keywords = ["lexical","model-based","tokenizer-required","highly-meaning-preserving","high-precision","low-coverage","high-generations","world-knowledge"] + keywords = [ + "lexical", + "model-based", + "tokenizer-required", + "highly-meaning-preserving", + "high-precision", + "low-coverage", + "high-generations", + "world-knowledge", + ] + # languages the operation can operate on. def __init__(self, seed=0, max_outputs=1, lang="en", data_path=None): + """ Constructor of the CityNamesTransformation object in a given language @@ -195,7 +201,9 @@ def __init__(self, seed=0, max_outputs=1, lang="en", data_path=None): # self.model = spacy.load("en_core_web_sm") # else: # self.model = spacy.load("es_core_news_sm") - self.model = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.model = ( + spacy_nlp if spacy_nlp else initialize_models("spacy", lang) + ) if lang == "en": if data_path is None: self.transformer = ChangeCityNames( From 769b4758d768834b26e25d713263e016a03e0766 Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 15:17:09 +0000 Subject: [PATCH 04/14] Modified spacy loading - synonym_substitution --- transformations/synonym_substitution/transformation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/transformations/synonym_substitution/transformation.py b/transformations/synonym_substitution/transformation.py index d8dd7aec2..075cd853c 100644 --- a/transformations/synonym_substitution/transformation.py +++ b/transformations/synonym_substitution/transformation.py @@ -1,15 +1,13 @@ import re import nltk -import spacy -from initialize import spacy_nlp -from nltk.corpus import wordnet import numpy as np +from nltk.corpus import wordnet +from initialize import initialize_models, 
spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType - """ Base Class for implementing the different input transformations a generation should be robust against. """ @@ -90,7 +88,9 @@ class SynonymSubstitution(SentenceOperation): def __init__(self, seed=42, prob=0.5, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default self.prob = prob nltk.download("wordnet") From 9289179150820cf0ca709a27a401f434c8b144dc Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 15:28:10 +0000 Subject: [PATCH 05/14] Modified spacy loading - add_hashtags --- .../add_hashtags/transformation.py | 63 +++++++++++++------ 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/transformations/add_hashtags/transformation.py b/transformations/add_hashtags/transformation.py index 140a06e2e..b07f6282d 100644 --- a/transformations/add_hashtags/transformation.py +++ b/transformations/add_hashtags/transformation.py @@ -1,7 +1,8 @@ +import random + +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType -import random -from spacy import load def extract_dep_nodes(dep_parse, be_class_verb): @@ -37,7 +38,7 @@ def generate_hashtag_from_noun_chunk(chunk_list, subj_obj_list): for chunk in chunk_list: if chunk.lower() not in subj_obj_list: chunk_words = [word.title() for word in chunk.split(" ")] - hash_tag_list.append("#"+"".join(chunk_words)) + hash_tag_list.append("#" + "".join(chunk_words)) return hash_tag_list @@ -45,7 +46,7 @@ def extract_noun_chunks_hashtag(dep_parse, subj_obj_list): """Method for extracting noun chunks from dependency parse""" chunk_list = [] for chunk in dep_parse.noun_chunks: - if len(str(chunk.text.split(" ")))>0: + if 
len(str(chunk.text.split(" "))) > 0: chunk_list.append(chunk.text) return generate_hashtag_from_noun_chunk(chunk_list, subj_obj_list) @@ -56,11 +57,13 @@ def extract_hashtags(sentence, nlp, be_class_verb, subj_obj_list): verb, nsubj, dobj = extract_dep_nodes(dep_parse, be_class_verb) hash_tag_list = [] for dep_n in [verb, nsubj, dobj]: - if(dep_n != ""): - hash_tag_list.append("#"+dep_n) + if dep_n != "": + hash_tag_list.append("#" + dep_n) if verb != "" and dobj != "": - hash_tag_list.append("#"+verb+dobj) - noun_chunks_hashtags = extract_noun_chunks_hashtag(dep_parse, subj_obj_list) + hash_tag_list.append("#" + verb + dobj) + noun_chunks_hashtags = extract_noun_chunks_hashtag( + dep_parse, subj_obj_list + ) if noun_chunks_hashtags is not None: for ht in noun_chunks_hashtags: if ht not in hash_tag_list: @@ -68,9 +71,13 @@ def extract_hashtags(sentence, nlp, be_class_verb, subj_obj_list): return verb, hash_tag_list -def get_hash_tags(sentence, be_class_verb, subj_obj_list, seed=0, max_outputs=1, nlp=None): +def get_hash_tags( + sentence, be_class_verb, subj_obj_list, seed=0, max_outputs=1, nlp=None +): """method for appending hashtags to sentence""" - verb, hashtag_list = extract_hashtags(sentence, nlp, be_class_verb, subj_obj_list) + verb, hashtag_list = extract_hashtags( + sentence, nlp, be_class_verb, subj_obj_list + ) transformation_list = [] for _ in range(max_outputs): random.seed(0) @@ -93,14 +100,30 @@ class HashtagGeneration(SentenceOperation): def __init__(self, seed=0, max_outputs=1): super().__init__(seed) - self.max_outputs=max_outputs - self.nlp = load('en_core_web_sm') - self.be_class_verb = ["is", "am", "are", "was", "were", "will", "shall"] + self.max_outputs = max_outputs + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default + self.be_class_verb = [ + "is", + "am", + "are", + "was", + "were", + "will", + "shall", + ] self.subj_obj_list = ["i", "you", "we", "they", "he", "she"] def generate(self, 
sentence: str): - transformed_sentences = get_hash_tags(sentence, self.be_class_verb, self.subj_obj_list, - self.seed, self.max_outputs, self.nlp) + transformed_sentences = get_hash_tags( + sentence, + self.be_class_verb, + self.subj_obj_list, + self.seed, + self.max_outputs, + self.nlp, + ) return transformed_sentences @@ -128,8 +151,8 @@ def generate(self, sentence: str): # test_cases[i]["outputs"].append({"sentence":trans_sentence}) # json_file = {"type": convert_to_snake_case("add_hashtags"), "test_cases": test_cases} # print(json.dumps(json_file)) - # for ip in input_sent: - # #random.seed(0) - # print(ip) - # res = tf.generate(ip) - # print(res) +# for ip in input_sent: +# #random.seed(0) +# print(ip) +# res = tf.generate(ip) +# print(res) From c3fe7957035d957bb8793675ac6449335269596f Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 15:58:18 +0000 Subject: [PATCH 06/14] Modified spacy loading - change_person_named_entities --- .../change_person_named_entities/transformation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transformations/change_person_named_entities/transformation.py b/transformations/change_person_named_entities/transformation.py index c2c510944..46b42342b 100644 --- a/transformations/change_person_named_entities/transformation.py +++ b/transformations/change_person_named_entities/transformation.py @@ -1,7 +1,7 @@ import numpy as np -import spacy from checklist.perturb import Perturb -from initialize import spacy_nlp + +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType @@ -15,7 +15,9 @@ def __init__(self, n=1, seed=0, max_outputs=2): # TODO: Do not repeat parse computations. 
super().__init__(seed, max_outputs=max_outputs) self.n = n - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def generate(self, sentence: str): np.random.seed(self.seed) From 370833df0d08b8cf63732031b42190603d4ec63a Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 16:27:04 +0000 Subject: [PATCH 07/14] Modified spacy loading - antonyms_substitute --- .../antonyms_substitute/transformation.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/transformations/antonyms_substitute/transformation.py b/transformations/antonyms_substitute/transformation.py index f3bca2d7e..ea6fd96cc 100644 --- a/transformations/antonyms_substitute/transformation.py +++ b/transformations/antonyms_substitute/transformation.py @@ -1,15 +1,13 @@ import re import nltk -import spacy -from initialize import spacy_nlp -from nltk.corpus import wordnet import numpy as np +from nltk.corpus import wordnet +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType - """ The code is adapted from @zijwang https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/synonym_substitution """ @@ -42,18 +40,19 @@ def untokenize(words): def is_synonyms(word1, word2): synonyms = [] for syn in wordnet.synsets(word1): - for l in syn.lemmas(): - synonyms.append(l.name()) + for lemma in syn.lemmas(): + synonyms.append(lemma.name()) if word2 in synonyms: return True return False + def is_antonyms(word1, word2): antonyms = [] for syn in wordnet.synsets(word1): - for l in syn.lemmas(): - if l.antonyms(): - antonyms.append(l.antonyms()[0].name()) + for lemma in syn.lemmas(): + if lemma.antonyms(): + antonyms.append(lemma.antonyms()[0].name()) if word2 in antonyms: return True return False @@ -83,10 +82,10 @@ def antonyms_substitute(text, 
spacy_pipeline, seed=22, max_outputs=1): antonyms = [] # synonyms = [] for syn in wordnet.synsets(word, pos=wn_pos): - for l in syn.lemmas(): + for lemma in syn.lemmas(): # synonyms.append(l.name()) - if l.antonyms(): - antonyms.append(l.antonyms()[0].name()) + if lemma.antonyms(): + antonyms.append(lemma.antonyms()[0].name()) antonyms = list(set(antonyms)) if len(antonyms) > 0: @@ -101,7 +100,7 @@ def antonyms_substitute(text, spacy_pipeline, seed=22, max_outputs=1): result = untokenize(result) # choose even number of changes - if counter%2 != 0: + if counter % 2 != 0: result = text # avoid doing transformation that original words are either synonyms or antonyms @@ -113,7 +112,6 @@ def antonyms_substitute(text, spacy_pipeline, seed=22, max_outputs=1): result = text break - if result not in results: # make sure there is no dup in results results.append(result) @@ -135,7 +133,9 @@ class AntonymsSubstitute(SentenceOperation): def __init__(self, seed=42, prob=0.5, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default self.prob = prob nltk.download("wordnet") From a5387ebcc4a57f2bd160f9b82417b9d754eddb9c Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 16:31:31 +0000 Subject: [PATCH 08/14] Modified spacy loading - emojify --- transformations/emojify/transformation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformations/emojify/transformation.py b/transformations/emojify/transformation.py index 255bb4202..8dd04f610 100644 --- a/transformations/emojify/transformation.py +++ b/transformations/emojify/transformation.py @@ -2,9 +2,7 @@ import random from json import load -import spacy - -from initialize import spacy_nlp +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from 
tasks.TaskTypes import TaskType @@ -79,7 +77,9 @@ def __init__(self, seed=2022, max_outputs=1): self.word_to_emoji = load(open(dict_path, "r")) # Load the spacy nlp - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def generate(self, sentence: str): """ From 43e2bcb4ae352d3f0870e3ac4af30346812f72fc Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Wed, 13 Oct 2021 16:43:01 +0000 Subject: [PATCH 09/14] Modified spacy loading - auxiliary_negation_removal --- .../auxiliary_negation_removal/transformation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/transformations/auxiliary_negation_removal/transformation.py b/transformations/auxiliary_negation_removal/transformation.py index 1c5f04a1d..4fcd139a1 100644 --- a/transformations/auxiliary_negation_removal/transformation.py +++ b/transformations/auxiliary_negation_removal/transformation.py @@ -1,6 +1,4 @@ -import spacy - -from initialize import spacy_nlp +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from interfaces.SentencePairOperation import SentencePairOperation from tasks.TaskTypes import TaskType @@ -120,7 +118,9 @@ class SentenceAuxiliaryNegationRemoval(SentenceOperation): def __init__(self, seed=0, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def generate(self, sentence: str): @@ -153,7 +153,9 @@ class PairAuxiliaryNegationRemoval(SentencePairOperation): def __init__(self, seed=0, max_outputs=3, pos_label="1", neg_label="0"): super().__init__(seed, max_outputs=max_outputs) - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else 
initialize_models() + ) # loads en_core_web_sm by default self.pos_label = pos_label self.neg_label = neg_label From 5d65e3ce9e410a53853e87851e3f140cab27b31c Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Thu, 14 Oct 2021 13:51:36 +0000 Subject: [PATCH 10/14] Modified spacy loading - correct_common_misspellings --- .../correct_common_misspellings/transformation.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/transformations/correct_common_misspellings/transformation.py b/transformations/correct_common_misspellings/transformation.py index 824cfeddc..00788ed13 100644 --- a/transformations/correct_common_misspellings/transformation.py +++ b/transformations/correct_common_misspellings/transformation.py @@ -1,7 +1,7 @@ -import spacy import json import os -from initialize import spacy_nlp + +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType @@ -25,7 +25,9 @@ class CorrectCommonMisspellings(SentenceOperation): def __init__(self): super().__init__() self.COMMON_MISSPELLINGS_DICT = get_common_misspellings_dict() - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def generate(self, sentence: str): doc = self.nlp(sentence) @@ -40,7 +42,9 @@ def generate(self, sentence: str): def get_common_misspellings_dict(): spell_corrections = os.path.join( - "transformations", "correct_common_misspellings", "spell_corrections.json" + "transformations", + "correct_common_misspellings", + "spell_corrections.json", ) with open(spell_corrections, "r") as fp: spell_corrections = json.load(fp) From ab18a95443b552806d931f54304ccb7f45b4ba5d Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Sun, 17 Oct 2021 05:44:51 +0000 Subject: [PATCH 11/14] Modified spacy loading - yes_no_question --- 
transformations/yes_no_question/transformation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/transformations/yes_no_question/transformation.py b/transformations/yes_no_question/transformation.py index a08f193c4..fcd1d750b 100644 --- a/transformations/yes_no_question/transformation.py +++ b/transformations/yes_no_question/transformation.py @@ -1,13 +1,12 @@ from typing import List, Optional, Union import pyinflect # noqa: F401 -import spacy from nltk.tokenize.treebank import TreebankWordDetokenizer from spacy.symbols import AUX, NOUN, PRON, PROPN, VERB, aux, cc, nsubj from spacy.tokens import Span, Token from spacy.tokens.doc import Doc -from initialize import spacy_nlp +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType @@ -90,7 +89,9 @@ class YesNoQuestionPerturbation(SentenceOperation): def __init__(self, seed=0, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) self.detokenizer = TreebankWordDetokenizer() - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def statement_to_question(self, sentence: Span) -> Union[str, None]: """Given a statement (type: spacy Span), convert to corresponding From 7cb5770fcd1d144f21d1dc719e4f88502c66a824 Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Sun, 17 Oct 2021 05:52:09 +0000 Subject: [PATCH 12/14] Modified spacy loading - subject_object_switch --- .../subject_object_switch/transformation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/transformations/subject_object_switch/transformation.py b/transformations/subject_object_switch/transformation.py index 4faa9cd84..26c40c6ba 100644 --- a/transformations/subject_object_switch/transformation.py +++ b/transformations/subject_object_switch/transformation.py @@ -1,6 +1,4 @@ -import 
spacy - -from initialize import spacy_nlp +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from interfaces.SentencePairOperation import SentencePairOperation from tasks.TaskTypes import TaskType @@ -114,7 +112,9 @@ class SentenceSubjectObjectSwitch(SentenceOperation): def __init__(self, seed=0, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) # Initialize the spacy model - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def generate(self, sentence: str): @@ -148,7 +148,9 @@ class PairSubjectObjectSwitch(SentencePairOperation): def __init__(self, seed=0, max_outputs=3, pos_label="1", neg_label="0"): super().__init__(seed, max_outputs=max_outputs) # Initialize the spacy model - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default self.pos_label = pos_label self.neg_label = neg_label From 0c1ac6ecea90b811ac2d7c2679ad1f30685dbfef Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Sun, 17 Oct 2021 06:04:22 +0000 Subject: [PATCH 13/14] Modified spacy loading - dyslexia_words_swap --- .../dyslexia_words_swap/transformation.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/transformations/dyslexia_words_swap/transformation.py b/transformations/dyslexia_words_swap/transformation.py index 5729acba0..63b6961d8 100644 --- a/transformations/dyslexia_words_swap/transformation.py +++ b/transformations/dyslexia_words_swap/transformation.py @@ -1,9 +1,9 @@ -import os import json -import spacy +import os + +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType -from initialize import spacy_nlp class 
DyslexiaWordsSwap(SentenceOperation): @@ -13,10 +13,8 @@ class DyslexiaWordsSwap(SentenceOperation): seed: initial seed. Defaults: 0. max_outputs: maximum number of generated outputs. Defaults: 1. """ - tasks = [ - TaskType.TEXT_CLASSIFICATION, - TaskType.TEXT_TO_TEXT_GENERATION - ] + + tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] languages = ["en"] keywords = [ "lexical", @@ -26,23 +24,29 @@ class DyslexiaWordsSwap(SentenceOperation): "possible-meaning-alteration", "low-precision", "low-coverage", - "low-generations" + "low-generations", ] def __init__(self, seed=0, max_outputs=1): super().__init__(seed=seed, max_outputs=max_outputs) - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") - with open(os.path.join(os.path.dirname(__file__), 'data.json'), "r") as infile: + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default + with open( + os.path.join(os.path.dirname(__file__), "data.json"), "r" + ) as infile: data = json.load(infile) - self.swap_words = {k: v for dict in data.values() for k, v in dict.items()} + self.swap_words = { + k: v for dict in data.values() for k, v in dict.items() + } self.swap_words_2 = {v: k for k, v in self.swap_words.items()} def generate(self, sentence: str): end_idx = 0 new_sentence = "" for word in self.nlp(sentence): - new_sentence += sentence[end_idx: word.idx] + new_sentence += sentence[end_idx : word.idx] new_word = word.text key = word.text.lower() From 36afafa1c1ee6414b4b75491cea933b2ac5711f2 Mon Sep 17 00:00:00 2001 From: Abinaya Mahendiran Date: Mon, 18 Oct 2021 14:23:33 +0000 Subject: [PATCH 14/14] Modified spacy loading - close_homophones_swap --- transformations/close_homophones_swap/transformation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/transformations/close_homophones_swap/transformation.py b/transformations/close_homophones_swap/transformation.py index d81fe22f2..5a4cdfdfa 100644 --- 
a/transformations/close_homophones_swap/transformation.py +++ b/transformations/close_homophones_swap/transformation.py @@ -1,9 +1,8 @@ import random -import spacy from SoundsLike.SoundsLike import Search -from initialize import spacy_nlp +from initialize import initialize_models, spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType @@ -60,7 +59,9 @@ class CloseHomophonesSwap(SentenceOperation): def __init__(self, seed=0, max_outputs=1): super().__init__(seed) self.max_outputs = max_outputs - self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.nlp = ( + spacy_nlp if spacy_nlp else initialize_models() + ) # loads en_core_web_sm by default def generate(self, sentence: str): perturbed_texts = close_homophones_swap(