Namespace stylotool

Sub-modules
-----------
stylotool.setup
stylotool.src
stylotool.test
Module stylotool.setup

Module stylotool.src.freestylo.AlliterationAnnotation

class AlliterationAnnotation(text: TextObject, max_skip=2, min_length=3, skip_tokens=['.', ',', ':', ';', '!', '?', '…', '(', ')', '[', ']', '{', '}', '„', '“', '‚', '‘:', '‘', '’'])

    This class is used to find alliteration candidates in a text.
    It uses the TextObject class to store the text and its annotations.

    Parameters
    ----------
    text : TextObject
        The text to be analyzed.
    max_skip : int, optional
        The maximum number of tokens that may lie between two consecutive members of a candidate.
    min_length : int, optional
        The minimum length of an alliteration candidate.
    skip_tokens : list, optional
        A list of tokens that should be skipped when looking for alliterations.

class AlliterationAnnotation:
+    """ This class is used to find alliteration candidates in a text.
+    It uses the TextObject class to store the text and its annotations.
+    """
+
+ def __init__(self, text : TextObject, max_skip = 2, min_length=3, skip_tokens=[".", ",", ":", ";", "!", "?", "…", "(", ")", "[", "]", "{", "}", "„", "“", "‚", "‘:", "‘", "’"]):
+ """
+ Parameters
+ ----------
+ text : TextObject
+ The text to be analyzed.
+        max_skip : int, optional
+            The maximum number of tokens that may lie between two consecutive members of a candidate.
+        min_length : int, optional
+            The minimum length of an alliteration candidate.
+        skip_tokens : list, optional
+ A list of tokens that should be skipped when looking for alliterations.
+ """
+
+ self.text = text
+ self.candidates = []
+ self.max_skip = max_skip
+ self.min_length = min_length
+ self.skip_tokens = skip_tokens
+
+
+ def find_candidates(self):
+ """
+ This method finds alliteration candidates in the text.
+ """
+ tokens = self.text.tokens
+
+ open_candidates = {}
+ i = 0
+
+ for i in range(len(tokens)):
+ token = tokens[i]
+ token_char = token[0].lower()
+ # check if there is an alliteration candidate with the current character
+ if not token_char.isalpha():
+ continue
+ # if not, create a new one
+ if token_char not in open_candidates:
+ open_candidates[token_char] = [AlliterationCandidate([i], token_char), 0]
+ continue
+ # if yes, add the current token to the candidate
+ candidate = open_candidates[token_char][0]
+ candidate.ids.append(i)
+
+ # close candidates
+ keys_to_delete = []
+ for key in open_candidates:
+ candidate_pair = open_candidates[key]
+ candidate = candidate_pair[0]
+ if token_char in self.skip_tokens:
+ candidate_pair[1] += 1
+ if i - candidate.ids[-1] >= self.max_skip+1+candidate_pair[1]:
+ if len(candidate.ids) > self.min_length:
+ self.candidates.append(candidate)
+ keys_to_delete.append(key)
+ for key_del in keys_to_delete:
+ del open_candidates[key_del]
+ # get the remaining ones
+ for key in open_candidates:
+ candidate = open_candidates[key][0]
+ if len(candidate.ids) > self.min_length:
+ self.candidates.append(candidate)
+
+
+
+ def serialize(self) -> list:
+ """
+ This method serializes the alliteration candidates into a list of dictionaries.
+
+ Returns
+ -------
+ list
+ A list of dictionaries containing the ids, length and character of the alliteration candidates.
+ """
+ candidates = []
+ for c in self.candidates:
+ candidates.append({
+ "ids": c.ids,
+ "length": c.length,
+ "char": c.char})
+ return candidates
+

Methods

def find_candidates(self)
    This method finds alliteration candidates in the text.

def serialize(self) -> list
    This method serializes the alliteration candidates into a list of dictionaries.

    Returns
    -------
    list

class AlliterationCandidate(ids, char)

    This class represents an alliteration candidate.

    Parameters
    ----------
    ids : list
        A list of token ids that form the alliteration candidate.
    char : str
        The character that the candidate starts with.
class AlliterationCandidate():
+ """
+ This class represents an alliteration candidate.
+ """
+ def __init__(self, ids, char):
+ """
+ Parameters
+ ----------
+ ids : list
+ A list of token ids that form the alliteration candidate.
+ char : str
+ The character that the candidate starts with.
+ """
+ self.ids = ids
+ self.char = char
+
+ @property
+ def score(self):
+ """
+ This property returns the score of the alliteration candidate.
+ """
+ return len(self.ids)
+
+ @property
+ def length(self):
+ """
+ This property returns the length of the alliteration candidate.
+ """
+ return len(self.ids)

Instance variables

prop length
    This property returns the length of the alliteration candidate.

prop score
    This property returns the score of the alliteration candidate.
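
A minimal usage sketch (illustrative, not part of the generated docs; the sample sentence is made up and the import paths assume the installed freestylo package plus a spaCy English model):

from freestylo.TextObject import TextObject
from freestylo.TextPreprocessor import TextPreprocessor
from freestylo.AlliterationAnnotation import AlliterationAnnotation

# build and preprocess a TextObject
text = TextObject(text="Peter Piper picked a peck of pickled peppers.", language="en")
TextPreprocessor(language="en").process_text(text)

# collect alliteration candidates and print them
alliteration = AlliterationAnnotation(text, max_skip=2, min_length=3)
alliteration.find_candidates()
for candidate in alliteration.serialize():
    print(candidate["char"], candidate["ids"])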
Module stylotool.src.freestylo.ChiasmusAnnotation

Functions

def cosine_similarity(vec1, vec2)
    This function calculates the cosine similarity between two vectors.

    Parameters
    ----------
    vec1 : np.array
    vec2 : np.array
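
The function body is not included in this diff; a minimal NumPy sketch consistent with the documented signature could look like this:

import numpy as np

def cosine_similarity(vec1, vec2):
    # dot product normalized by the product of the vector norms
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))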

class ChiasmusAnnotation(text: TextObject, window_size=30)

    This class is used to find chiasmus candidates in a text.
    It uses the TextObject class to store the text and its annotations.

    Parameters
    ----------
    text : TextObject
        The text to be analyzed.
    window_size : int, optional
        The window size to search for chiasmus candidates.

class ChiasmusAnnotation:
+ """
+ This class is used to find chiasmus candidates in a text.
+ It uses the TextObject class to store the text and its annotations.
+ """
+ def __init__(self, text : TextObject, window_size=30):
+ """
+ Parameters
+ ----------
+ text : TextObject
+ The text to be analyzed.
+ window_size : int, optional
+ The window size to search for chiasmus candidates
+ """
+ self.text = text
+ text.annotations.append(self)
+ self.window_size = window_size
+ self.candidates = []
+ self.denylist = []
+ self.allowlist = []
+ self.neglist = []
+ self.poslist = []
+ self.conjlist = []
+ self.type = "chiasmus"
+ self.model = None
+
+
+ def find_candidates(self):
+ """
+ This method finds chiasmus candidates in the text.
+ It uses the window_size to search for candidates.
+ """
+ pos = self.text.pos
+
+ outer_matches = []
+ for i in range(len(pos)):
+ outer_matches += self._find_matches(i, i + self.window_size)
+
+ for match in outer_matches:
+ A, A_ = match
+ start_inner = A + 1
+ inner_matches = self._find_matches(start_inner, A_)
+ for B, B_ in inner_matches:
+ self.candidates.append(ChiasmusCandidate(A, B, B_, A_))
+
+ def load_classification_model(self, model_path):
+ """
+ This method loads a classification model to score the chiasmus candidates.
+ Parameters
+ ----------
+ model_path : str
+ The path to the model file.
+ """
+ import pickle
+ with open(get_model_path(model_path), "rb") as f:
+ self.model = pickle.load(f)
+
+ def serialize(self) -> list:
+ """
+ This method serializes the chiasmus candidates.
+
+ Returns
+ -------
+ list
+ A list of serialized candidates.
+ """
+ candidates = []
+ for c in self.candidates:
+ candidates.append({
+ "ids": c.ids,
+ "A": c.A,
+ "B": c.B,
+ "B_": c.B_,
+ "A_": c.A_,
+ "score": c.score})
+ return candidates
+
+
+
+
+ def _find_matches(self, start : int, end : int) -> list:
+ """
+ This method finds matches in the pos list of the text.
+ It uses the start and end index to search for matches.
+
+ Parameters
+ ----------
+ start : int
+ The start index of the search.
+ end : int
+ The end index of the search.
+ """
+ pos = self.text.pos
+
+ #if end > len(pos):
+ # end = len(pos)
+
+ #if end < start+3:
+ # return []
+
+ if not self._check_pos(pos[start]):
+ return []
+ matches = []
+ for i in range(start+1, end):
+ try:
+ if pos[start] == pos[i]:
+ matches.append((start, i))
+ except IndexError:
+ pass
+ return matches
+
+ def _check_pos(self, pos):
+ """
+ This method checks if a pos is in the allowlist or not in the denylist.
+
+ Parameters
+ ----------
+ pos : str
+ The pos to check.
+ """
+ if len(self.allowlist) > 0 and pos not in self.allowlist:
+ return False
+ if len(self.denylist) > 0 and pos in self.denylist:
+ return False
+ return True
+
+ def has_candidates(self):
+ """
+ This method checks if the text has chiasmus candidates.
+ """
+ return len(self.candidates) > 0
+
+ def score_candidates(self):
+ """
+ This method scores the chiasmus candidates.
+ """
+ features = []
+ for candidate in self.candidates:
+ features.append(self.get_features(candidate))
+ if self.model is None:
+ print("Load Chiasmus Model before scoring the candidates")
+ return False
+ features = np.stack(features)
+ scores = self.model.decision_function(features)
+ for score, candidate in zip(scores, self.candidates):
+ candidate.score = score
+ return True
+
+ def get_features(self, candidate):
+ """
+ This method extracts features for a chiasmus candidate.
+
+ Parameters
+ ----------
+ candidate : ChiasmusCandidate
+ The candidate to extract features from.
+
+ Returns
+ -------
+ np.array
+ An array of features.
+ """
+
+ dubremetz_features = self.get_dubremetz_features(candidate)
+ lexical_features = self.get_lexical_features(candidate)
+ semantic_features = self.get_semantic_features(candidate)
+ return np.concatenate((dubremetz_features, lexical_features, semantic_features))
+
+ def get_dubremetz_features(self, candidate):
+ """
+ This method extracts Dubremetz features for a chiasmus candidate.
+
+ Returns
+ -------
+ np.array
+ An array of Dubremetz features
+ """
+ tokens = self.text.tokens
+ lemmas = self.text.lemmas
+ pos = self.text.pos
+ dep = self.text.dep
+ vectors = self.text.vectors
+
+ context_start = candidate.A - 5
+ context_end = candidate.A_ + 5
+
+ tokens_main = [tokens[i] for i in range(candidate.A, candidate.A_+1)]
+ lemmas_main = [lemmas[i] for i in range(candidate.A, candidate.A_+1)]
+ pos_main = [pos[i] for i in range(candidate.A, candidate.A_+1)]
+ dep_main = [dep[i] for i in range(candidate.A, candidate.A_+1)]
+ vectors_main = [vectors[i] for i in range(candidate.A, candidate.A_+1)]
+
+ neglist = self.neglist
+ poslist = self.poslist
+ conjlist = self.conjlist
+
+ hardp_list = ['.', '(', ')', "[", "]"]
+ softp_list = [',', ';']
+
+ features = []
+
+ # Basic
+
+ num_punct = 0
+ for h in hardp_list:
+ if h in tokens[ candidate.ids[0]+1 : candidate.ids[1] ]: num_punct+=1
+ if h in tokens[ candidate.ids[2]+1 : candidate.ids[3] ]: num_punct+=1
+ features.append(num_punct)
+
+        # soft punctuation between the A-B and B_-A_ spans
+        num_punct = 0
+        for s in softp_list:
+            if s in tokens[ candidate.ids[0]+1 : candidate.ids[1] ]: num_punct+=1
+            if s in tokens[ candidate.ids[2]+1 : candidate.ids[3] ]: num_punct+=1
+        features.append(num_punct)
+
+ num_punct = 0
+ for h in hardp_list:
+ if h in tokens[ candidate.ids[1]+1 : candidate.ids[2] ]: num_punct+=1
+ features.append(num_punct)
+
+ rep_a1 = -1
+ if lemmas[candidate.ids[0]] == lemmas[candidate.ids[3]]:
+ rep_a1 -= 1
+ rep_a1 += lemmas.count(lemmas[candidate.ids[0]])
+ features.append(rep_a1)
+
+ rep_b1 = -1
+ if lemmas[candidate.ids[1]] == lemmas[candidate.ids[2]]:
+ rep_b1 -= 1
+ rep_b1 += lemmas.count(lemmas[candidate.ids[1]])
+ features.append(rep_b1)
+
+ rep_b2 = -1
+ if lemmas[candidate.ids[1]] == lemmas[candidate.ids[2]]:
+ rep_b2 -= 1
+ rep_b2 += lemmas.count(lemmas[candidate.ids[2]])
+ features.append(rep_b2)
+
+ rep_a2 = -1
+ if lemmas[candidate.ids[0]] == lemmas[candidate.ids[3]]:
+ rep_a2 -= 1
+ rep_a2 += lemmas.count(lemmas[candidate.ids[3]])
+        features.append(rep_a2)
+
+ # Size
+
+ diff_size = abs((candidate.ids[1]-candidate.ids[0]) - (candidate.ids[3]-candidate.ids[2]))
+ features.append(diff_size)
+
+ toks_in_bc = candidate.ids[3]-candidate.ids[1]
+ features.append(toks_in_bc)
+
+ # Similarity
+
+ exact_match = ([" ".join(tokens[candidate.ids[0]+1 : candidate.ids[1]])] == [" ".join(tokens[candidate.ids[2]+1 : candidate.ids[3]])])
+ features.append(exact_match)
+
+ same_tok = 0
+ for l in lemmas[candidate.ids[0]+1 : candidate.ids[1]]:
+ if l in lemmas[candidate.ids[2]+1 : candidate.ids[3]]: same_tok += 1
+ features.append(same_tok)
+
+ sim_score = same_tok / (candidate.ids[1]-candidate.ids[0])
+ features.append(sim_score)
+
+ num_bigrams = 0
+ t1 = " ".join(tokens[candidate.ids[0]+1 : candidate.ids[1]])
+ t2 = " ".join(tokens[candidate.ids[2]+1 : candidate.ids[3]])
+ s1 = set()
+ s2 = set()
+ for t in range(len(t1)-1):
+ bigram = t1[t:t+2]
+ s1.add(bigram)
+ for t in range(len(t2)-1):
+ bigram = t2[t:t+2]
+ s2.add(bigram)
+ for b in s1:
+ if b in s2: num_bigrams += 1
+ bigrams_normed = (num_bigrams/max(len(s1)+1, len(s2)+1))
+ features.append(bigrams_normed)
+
+ num_trigrams = 0
+ t1 = " ".join(tokens[candidate.ids[0]+1 : candidate.ids[1]])
+ t2 = " ".join(tokens[candidate.ids[2]+1 : candidate.ids[3]])
+ s1 = set()
+ s2 = set()
+ for t in range(len(t1)-2):
+ trigram = t1[t:t+3]
+ s1.add(trigram)
+ for t in range(len(t2)-2):
+ trigram = t2[t:t+3]
+ s2.add(trigram)
+ for t in s1:
+ if t in s2: num_trigrams += 1
+ trigrams_normed = (num_trigrams/max(len(s1)+1, len(s2)+1))
+ features.append(trigrams_normed)
+
+ same_cont = 0
+ t1 = set(tokens[candidate.ids[0]+1:candidate.ids[1]])
+ t2 = set(tokens[candidate.ids[2]+1:candidate.ids[3]])
+ for t in t1:
+ if t in t2: same_cont += 1
+ features.append(same_cont)
+
+ # Lexical clues
+
+ conj = 0
+ for c in conjlist:
+ if c in tokens[candidate.ids[1]+1:candidate.ids[2]]+lemmas[candidate.ids[1]+1:candidate.ids[2]]:
+ conj = 1
+ features.append(conj)
+
+
+ neg = 0
+ for n in neglist:
+ if n in tokens[candidate.ids[1]+1:candidate.ids[2]]+lemmas[candidate.ids[1]+1:candidate.ids[2]]:
+ neg = 1
+ features.append(neg)
+
+
+ # Dependency score
+
+ if dep[candidate.ids[1]] == dep[candidate.ids[3]]:
+ features.append(1)
+ else:
+ features.append(0)
+
+ if dep[candidate.ids[0]] == dep[candidate.ids[2]]:
+ features.append(1)
+ else:
+ features.append(0)
+
+ if dep[candidate.ids[1]] == dep[candidate.ids[2]]:
+ features.append(1)
+ else:
+ features.append(0)
+
+ if dep[candidate.ids[0]] == dep[candidate.ids[3]]:
+ features.append(1)
+ else:
+ features.append(0)
+
+ features = np.array(features)
+ return features
+
+ def get_lexical_features(self, candidate):
+ """
+ This method extracts lexical features for a chiasmus candidate.
+
+ Returns
+ -------
+ np.array
+ An array of lexical features
+ """
+ tokens = self.text.tokens
+ lemmas = self.text.lemmas
+ pos = self.text.pos
+ dep = self.text.dep
+ vectors = self.text.vectors
+
+ context_start = candidate.A - 5
+ context_end = candidate.A_ + 5
+
+ lemmas_main = [lemmas[i] for i in candidate.ids]
+
+
+ neglist = self.neglist
+ poslist = self.poslist
+
+ features = []
+
+
+ for i in range(len(lemmas_main)):
+ for j in range(i+1, len(lemmas_main)):
+ if lemmas_main[i] == lemmas_main[j]:
+ features.append(1)
+ else:
+ features.append(0)
+
+ features = np.array(features)
+ return features
+
+ def get_semantic_features(self, candidate):
+ """
+ This method extracts semantic features for a chiasmus candidate.
+
+ Returns
+ -------
+ np.array
+ An array of semantic features
+ """
+ tokens = self.text.tokens
+ lemmas = self.text.lemmas
+ pos = self.text.pos
+ dep = self.text.dep
+ vectors = self.text.vectors
+
+ context_start = candidate.A - 5
+ context_end = candidate.A_ + 5
+
+ vectors_main = [vectors[i] for i in candidate.ids]
+
+
+ features = []
+ for i in range(len(vectors_main)):
+ for j in range(i+1, len(vectors_main)):
+ features.append(cosine_similarity(vectors_main[i], vectors_main[j]))
+
+ features = np.array(features)
+ return features
+

Methods

def find_candidates(self)
    This method finds chiasmus candidates in the text.
    It uses the window_size to search for candidates.

def get_dubremetz_features(self, candidate)
    This method extracts Dubremetz features for a chiasmus candidate.

    Returns
    -------
    np.array
        An array of Dubremetz features.

def get_features(self, candidate)
    This method extracts features for a chiasmus candidate.

    Parameters
    ----------
    candidate : ChiasmusCandidate
        The candidate to extract features from.

    Returns
    -------
    np.array
        An array of features.

def get_lexical_features(self, candidate)
    This method extracts lexical features for a chiasmus candidate.

    Returns
    -------
    np.array
        An array of lexical features.

def get_semantic_features(self, candidate)
    This method extracts semantic features for a chiasmus candidate.

    Returns
    -------
    np.array
        An array of semantic features.

def has_candidates(self)
    This method checks if the text has chiasmus candidates.

def load_classification_model(self, model_path)
    This method loads a classification model to score the chiasmus candidates.

    Parameters
    ----------
    model_path : str
        The path to the model file.

def score_candidates(self)
    This method scores the chiasmus candidates.

def serialize(self) -> list
    This method serializes the chiasmus candidates.

    Returns
    -------
    list
        A list of serialized candidates.

class ChiasmusCandidate(A, B, B_, A_)

    This class represents a chiasmus candidate.

    Parameters
    ----------
    A : int
        Index of the first supporting word.
    B : int
        Index of the second supporting word.
    B_ : int
        Index of the third supporting word, paired with B.
    A_ : int
        Index of the fourth supporting word, paired with A.
class ChiasmusCandidate:
+ """
+ This class represents a chiasmus candidate.
+ """
+ def __init__(self, A, B, B_, A_):
+ """
+ Parameters
+ ----------
+ A : int
+ Index of the first supporting word
+ B : int
+ Index of the second supporting word
+ B_ : int
+ Index of the third supporting word, paired with B
+ A_ : int
+ Index of the fourth supporting word, paired with A
+ """
+
+ self.ids = [A, B, B_, A_]
+ self.A = A
+ self.B = B
+ self.B_ = B_
+ self.A_ = A_
+ self.score = None
+
+ def __str__(self):
+ """
+ This method returns a string representation of the chiasmus candidate.
+ """
+ return f"{self.A} {self.B} {self.B_} {self.A_}"
Module stylotool.src.freestylo.Configs

Functions

def get_model_path(model_to_load: str) -> str

Module stylotool.src.freestylo.EpiphoraAnnotation

class EpiphoraAnnotation(text: TextObject, min_length=2, conj=['and', 'or', 'but', 'nor'], punct_pos='PUNCT')

    This class is used to find epiphora candidates in a text.
    It uses the TextObject class to store the text and its annotations.

    Parameters
    ----------
    text : TextObject
        The text to be analyzed.
    min_length : int, optional
        The minimum length of the epiphora candidates.
    conj : list, optional
        A list of conjunctions that should be considered when looking for epiphora.
    punct_pos : str, optional
        The part of speech tag for punctuation.

class EpiphoraAnnotation:
+ """
+ This class is used to find epiphora candidates in a text.
+ It uses the TextObject class to store the text and its annotations.
+ """
+ def __init__(self, text : TextObject, min_length=2, conj = ["and", "or", "but", "nor"], punct_pos="PUNCT"):
+ """
+ Constructor for the EpiphoraAnnotation class.
+
+ Parameters
+ ----------
+ text : TextObject
+ The text to be analyzed.
+ min_length : int, optional
+ The minimum length of the epiphora candidates.
+ conj : list, optional
+ A list of conjunctions that should be considered when looking for epiphora.
+ punct_pos : str, optional
+ The part of speech tag for punctuation.
+ """
+
+ self.text = text
+ self.candidates = []
+ self.min_length = min_length
+ self.conj = conj
+ self.punct_pos = punct_pos
+
+ def split_in_phrases(self):
+ """
+ This method splits the text into phrases.
+
+ Returns
+ -------
+ list
+ A list of lists, each containing the start and end index of a phrase.
+ """
+
+ phrases = []
+ current_start = 0
+ for i, token in enumerate(self.text.tokens):
+ if token in self.conj or self.text.pos[i] == self.punct_pos:
+ if i-current_start > 2:
+ phrases.append([current_start, i])
+ current_start = i+1
+ phrases.append([current_start, len(self.text.tokens)])
+ return phrases
+
+
+ def find_candidates(self):
+ """
+ This method finds epiphora candidates in the text.
+ """
+ candidates = []
+ current_candidate = EpiphoraCandidate([], "")
+ phrases = self.split_in_phrases()
+ for phrase in phrases:
+ word = self.text.tokens[phrase[1]-1]
+ if word != current_candidate.word:
+ if len(current_candidate.ids) >= self.min_length:
+ candidates.append(current_candidate)
+ current_candidate = EpiphoraCandidate([phrase], word)
+ else:
+ current_candidate.ids.append(phrase)
+        # keep a final candidate that extends to the end of the text
+        if len(current_candidate.ids) >= self.min_length:
+            candidates.append(current_candidate)
+        self.candidates = candidates
+
+ def serialize(self) -> list:
+ """
+ This method serializes the epiphora candidates.
+
+ Returns
+ -------
+ list
+ A list of dictionaries, each containing the ids, length, and word of an epiphora candidate.
+ """
+ candidates = []
+ for c in self.candidates:
+ candidates.append({
+ "ids": c.ids,
+ "length": c.length,
+ "word": c.word})
+ return candidates
+

Methods

def find_candidates(self)
    This method finds epiphora candidates in the text.

def serialize(self) -> list
    This method serializes the epiphora candidates.

    Returns
    -------
    list
        A list of dictionaries, each containing the ids, length, and word of an epiphora candidate.

def split_in_phrases(self)
    This method splits the text into phrases.

    Returns
    -------
    list
        A list of lists, each containing the start and end index of a phrase.

class EpiphoraCandidate(ids, word)

    This class represents an epiphora candidate.

    Parameters
    ----------
    ids : list
        A list of token ids that form the candidate.
    word : str
        The word that the candidate ends with.
class EpiphoraCandidate():
+ """
+ This class represents an epiphora candidate.
+ """
+ def __init__(self, ids, word):
+ """
+ Constructor for the EpiphoraCandidate class.
+
+ Parameters
+ ----------
+ ids : list
+ A list of token ids that form the candidate.
+ word : str
+ The word that the candidate ends with.
+ """
+ self.ids = ids
+ self.word = word
+
+ @property
+ def score(self):
+ """
+ This property returns the score of the candidate.
+ """
+ return len(self.ids)

Instance variables

prop score
    This property returns the score of the candidate.
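
A short usage sketch (illustrative sample sentence; same assumptions as the alliteration example above):

from freestylo.TextObject import TextObject
from freestylo.TextPreprocessor import TextPreprocessor
from freestylo.EpiphoraAnnotation import EpiphoraAnnotation

text = TextObject(text="When I was a child, I spoke as a child, I understood as a child, I thought as a child.", language="en")
TextPreprocessor(language="en").process_text(text)

epiphora = EpiphoraAnnotation(text, min_length=2)
epiphora.find_candidates()
for candidate in epiphora.serialize():
    print(candidate["word"], candidate["ids"])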
Module stylotool.src.freestylo.MGHPreprocessor

class MGHPreprocessor()

    This class preprocesses Middle High German text.
class MGHPreprocessor:
+ """
+ This class preprocesses Middle High German text.
+ """
+ def __init__(self):
+ """
+ Constructor for the MGHPreprocessor class.
+ """
+ self.text = ""
+ self.model = fasttext.load_model(get_model_path("fasttext_mgh.bin"))
+
+ # make class callable with ()
+ def __call__(self, text):
+ """
+ This method preprocesses Middle High German text.
+
+ Parameters
+ ----------
+ text : str
+ The text to be preprocessed.
+
+ Returns
+ -------
+ list
+ A list of MGH tokens.
+ """
+ self.text = normalize_middle_high_german(text)
+
+ tokens = []
+
+ idx = 0
+ pos_tagger = POSTag('middle_high_german')
+ lemmatizer = BackoffMHGLemmatizer()
+ # custom tokenizer, because I need the character index of the word
+ while True:
+ word, next_idx = self.get_next_word(self.text, idx)
+
+ pos = pos_tagger.tag_tnt(word)[0][1]
+
+ lemma = min(lemmatizer.lemmatize([word])[0][1], key=len)
+
+ dep = ""
+
+ vector = self.model.get_word_vector(word)
+
+
+ tokens.append(MGHToken(word, pos, lemma, dep, vector, idx))
+
+ if next_idx is None:
+ break
+ idx = next_idx
+ return tokens
+
+
+
+ def get_next_word(self, text, idx):
+ """
+ This method finds the next word in a text.
+
+ Parameters
+ ----------
+        text : str
+ The text to be searched.
+ idx : int
+ The index of the current word.
+
+ Returns
+ -------
+ str
+ The next word in the text.
+ int
+ The index of the next word.
+ """
+ cursor = idx
+ is_end = False
+ # find end of current word
+ while cursor < len(text):
+ try:
+ if text[cursor] in [" ", "\n", "\t"]:
+ break
+ except: # end of text
+ is_end = True
+ break
+ cursor += 1
+
+ end_word = cursor
+
+ #find start of next word
+ while cursor < len(text):
+ try:
+ if text[cursor] not in [" ", "\n", "\t"]:
+ break
+ except:
+ is_end = True
+ break
+ cursor += 1
+
+ next_word = cursor
+
+ if cursor == len(text):
+ next_word = None
+
+ word = text[idx:end_word]
+
+ return word, next_word
+

Methods

def get_next_word(self, text, idx)
    This method finds the next word in a text.

    Parameters
    ----------
    text : str
        The text to be searched.
    idx : int
        The index of the current word.

    Returns
    -------
    str
        The next word in the text.
    int
        The index of the next word.

class MGHToken(text, pos, lemma, dep, vector, idx)

    This class represents a Middle High German token.

    Parameters
    ----------
    text : str
        The text of the token.
    pos : str
        The part of speech of the token.
    lemma : str
        The lemma of the token.
    dep : str
        The dependency of the token.
    vector : np.array
        The vector representation of the token.
    idx : int
        The index of the token in the text.
class MGHToken:
+ """
+ This class represents a Middle High German token.
+ """
+ def __init__(self, text, pos, lemma, dep, vector, idx):
+ """
+ Constructor for the MGHToken class.
+
+ Parameters
+ ----------
+ text : str
+ The text of the token.
+ pos : str
+ The part of speech of the token.
+ lemma : str
+ The lemma of the token.
+ dep : str
+ The dependency of the token.
+ vector : np.array
+ The vector representation of the token.
+ idx : int
+ The index of the token in the text.
+ """
+ self.text = text
+ self.pos = pos
+ self.lemma = lemma
+ self.dep = dep
+ self.vector = vector
+ self.idx = idx
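
A usage sketch for the Middle High German pipeline (assumes the cltk Middle High German models and the bundled fasttext vectors can be loaded; the sample line is from the Nibelungenlied):

from freestylo.MGHPreprocessor import MGHPreprocessor

# the preprocessor is callable and returns a list of MGHToken objects
preprocessor = MGHPreprocessor()
tokens = preprocessor("Uns ist in alten maeren wunders vil geseit")
for token in tokens:
    print(token.text, token.pos, token.lemma, token.idx)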
Module stylotool.src.freestylo.MetaphorAnnotation

Functions

def cosine_distance(a, b)
    This function calculates the cosine distance between two vectors.

    Parameters
    ----------
    a : torch.Tensor
    b : torch.Tensor

    Returns
    -------
    float
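
The function body is not included in this diff; a sketch that mirrors the formula commented out in score_candidates below (mapping cosine similarity into a [0, 1] distance) would be:

import torch

def cosine_distance(a, b):
    # 0 for parallel vectors, 1 for antiparallel ones
    return 1 - (torch.nn.functional.cosine_similarity(a, b) + 1) / 2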

class MetaphorAnnotation(text)

    This class is used to find metaphor candidates in a text.
    It uses the TextObject class to store the text and its annotations.

    Parameters
    ----------
    text : TextObject
        The text to be analyzed.
class MetaphorAnnotation:
+ """
+ This class is used to find metaphor candidates in a text.
+ It uses the TextObject class to store the text and its annotations.
+ """
+ def __init__(self, text):
+ """
+ Constructor for the MetaphorAnnotation class.
+
+ Parameters
+ ----------
+ text : TextObject
+ The text to be analyzed.
+ """
+ self.text = text
+ text.annotations.append(self)
+ self.candidates = []
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.type = "metaphor"
+ self.model = None
+
+ def find_candidates(self):
+ """
+ This method finds metaphor candidates in the text.
+ """
+ pos = self.text.pos
+ for i in range(len(pos)-1):
+ if pos[i] == "ADJ" and pos[i+1] == "NOUN":
+ self.candidates.append(MetaphorCandidate(i, i+1))
+
+ def serialize(self) -> list:
+ """
+ This method serializes the metaphor candidates.
+
+ Returns
+ -------
+ list
+ A list of dictionaries, each containing the ids of the adjective and noun, the adjective, the noun, and the score.
+ """
+ candidates = []
+ for c in self.candidates:
+ candidates.append({
+ "ids": c.ids,
+ "adjective": c.adj_id,
+ "noun": c.noun_id,
+ "score": c.score})
+ return candidates
+
+
+ def load_model(self, model_path):
+ """
+ This method loads a model for metaphor detection.
+
+ Parameters
+ ----------
+ model_path : str
+ The path to the model.
+ """
+ model_path = get_model_path(model_path)
+ self.model = SimilarityNN.SimilarityNN(300, 128, 1, 128, self.device)
+ self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device))
+ self.model = self.model.to(self.device)
+ self.model.eval()
+
+ def get_vectors(self):
+ """
+ This method returns the vectors of the adjective and noun candidates.
+
+ Returns
+ -------
+ np.array
+ An array of adjective vectors.
+ np.array
+ An array of noun vectors.
+ """
+ adj_vectors = []
+ noun_vectors = []
+ for candidate in self.candidates:
+ adj_vectors.append(self.text.vectors[candidate.ids[0]])
+ noun_vectors.append(self.text.vectors[candidate.ids[1]])
+
+ adj_vectors = np.array(adj_vectors)
+ noun_vectors = np.array(noun_vectors)
+ return adj_vectors, noun_vectors
+
+ def score_candidates(self):
+ """
+ This method scores the metaphor candidates.
+ """
+ adj_vectors, noun_vectors = self.get_vectors()
+ adj_tensor = torch.tensor(adj_vectors, device=self.device).to(self.device)
+ noun_tensor = torch.tensor(noun_vectors, device=self.device).to(self.device)
+ assert(self.model is not None)
+ adj_metaphor_tensor = self.model(adj_tensor)
+ noun_metaphor_tensor = self.model(noun_tensor)
+ #scores = 1-(torch.nn.CosineSimilarity()(adj_metaphor_tensor, noun_metaphor_tensor)+1)/2
+ scores = cosine_distance(adj_metaphor_tensor, noun_metaphor_tensor)
+ for score, candidate in zip(scores, self.candidates):
+ candidate.score = score.item()
+

Methods

def find_candidates(self)
    This method finds metaphor candidates in the text.

def get_vectors(self)
    This method returns the vectors of the adjective and noun candidates.

    Returns
    -------
    np.array
        An array of adjective vectors.
    np.array
        An array of noun vectors.

def load_model(self, model_path)
    This method loads a model for metaphor detection.

    Parameters
    ----------
    model_path : str
        The path to the model.

def score_candidates(self)
    This method scores the metaphor candidates.

def serialize(self) -> list
    This method serializes the metaphor candidates.

    Returns
    -------
    list

class MetaphorCandidate(adj_id, noun_id)

    This class represents a metaphor candidate.

    Parameters
    ----------
    adj_id : int
        The id of the adjective.
    noun_id : int
        The id of the noun.
class MetaphorCandidate():
+ """
+ This class represents a metaphor candidate.
+ """
+ def __init__(self, adj_id, noun_id):
+ """
+ Constructor for the MetaphorCandidate class.
+
+ Parameters
+ ----------
+ adj_id : int
+ The id of the adjective.
+ noun_id : int
+ The id of the noun.
+ """
+ self.ids = [adj_id, noun_id]
+ self.noun_id = noun_id
+ self.adj_id = adj_id
+ self.score = None
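
A hedged usage sketch (the model file name is hypothetical; load_model expects a saved state dict for the SimilarityNN documented below, and the spaCy lg models provide the 300-dimensional vectors it assumes):

from freestylo.TextObject import TextObject
from freestylo.TextPreprocessor import TextPreprocessor
from freestylo.MetaphorAnnotation import MetaphorAnnotation

text = TextObject(text="The loud color of her coat startled everyone.", language="en")
TextPreprocessor(language="en").process_text(text)

metaphor = MetaphorAnnotation(text)
metaphor.find_candidates()                 # collects ADJ-NOUN pairs
metaphor.load_model("metaphor_en.torch")   # hypothetical file name
metaphor.score_candidates()
print(metaphor.serialize())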
Module stylotool.src.freestylo.PolysyndetonAnnotation

class PolysyndetonAnnotation(text: TextObject, min_length=2, conj=['and', 'or', 'but', 'nor'], sentence_end_tokens=['.', '?', '!', ':', ';', '...'], punct_pos='PUNCT')

    This class is used to find polysyndeton candidates in a text.
    It uses the TextObject class to store the text and its annotations.

    Parameters
    ----------
    text : TextObject
        The text to be analyzed.
    min_length : int, optional
        The minimum length of the polysyndeton candidates.
    conj : list, optional
        A list of conjunctions that should be considered when looking for polysyndeton.
    sentence_end_tokens : list, optional
        A list of tokens that indicate the end of a sentence.
    punct_pos : str, optional
        The part of speech tag for punctuation.

class PolysyndetonAnnotation:
+ """
+ This class is used to find polysyndeton candidates in a text.
+ It uses the TextObject class to store the text and its annotations.
+ """
+ def __init__(self, text : TextObject, min_length=2, conj = ["and", "or", "but", "nor"], sentence_end_tokens=[".", "?", "!", ":", ";", "..."], punct_pos="PUNCT"):
+ """
+ Constructor for the PolysyndetonAnnotation class.
+
+ Parameters
+ ----------
+ text : TextObject
+ The text to be analyzed.
+ min_length : int, optional
+ The minimum length of the polysyndeton candidates.
+ conj : list, optional
+ A list of conjunctions that should be considered when looking for polysyndeton.
+ sentence_end_tokens : list, optional
+ A list of tokens that indicate the end of a sentence.
+ punct_pos : str, optional
+ The part of speech tag for punctuation.
+ """
+
+ self.text = text
+ self.candidates = []
+ self.min_length = min_length
+ self.conj = conj
+ self.sentence_end_tokens = sentence_end_tokens
+ self.punct_pos = punct_pos
+
+ def split_in_phrases(self):
+ """
+ This method splits the text into phrases.
+
+ Returns
+ -------
+ list
+ A list of lists, each containing the start and end index of a phrase.
+ """
+
+ phrases_in_sentences = []
+ phrases = []
+ current_sentence_start = 0
+ current_phrase_start = 0
+ for i, token in enumerate(self.text.tokens):
+ if token in self.sentence_end_tokens:
+ phrases.append([current_phrase_start, i])
+ current_phrase_start = i+1
+ current_sentence_start = i+1
+ phrases_in_sentences.append(phrases)
+ phrases = []
+ elif token in self.conj and i-current_phrase_start > 1:
+ phrases.append([current_phrase_start, i])
+ current_phrase_start = i
+ return phrases_in_sentences
+
+ def check_add_candidate(self, candidates, candidate):
+ """
+ This method checks if the candidate is long enough to be a polysyndeton candidate.
+
+ Parameters
+ ----------
+        candidates : list
+            A list of polysyndeton candidates.
+        candidate : PolysyndetonCandidate
+            The candidate to check and possibly add.
+        """
+ if len(candidate.ids) >= self.min_length:
+ candidates.append(candidate)
+ return candidates
+
+
+
+ def find_candidates(self):
+ """
+ This method finds polysyndeton candidates in the text.
+ """
+ candidates = []
+ sentences = self.split_in_phrases()
+ for sentence in sentences:
+ current_candidate = PolysyndetonCandidate([], "")
+ current_word = ""
+ for phrase in sentence:
+ word = self.text.tokens[phrase[0]]
+ if word != current_candidate.word:
+ candidates = self.check_add_candidate(candidates, current_candidate)
+ current_candidate = PolysyndetonCandidate([phrase], word)
+ else:
+ current_candidate.ids.append(phrase)
+ candidates = self.check_add_candidate(candidates, current_candidate)
+
+ self.candidates = []
+ for candidate in candidates:
+ if candidate.word in self.conj:
+ self.candidates.append(candidate)
+
+
+ def serialize(self) -> list:
+ """
+ This method serializes the polysyndeton candidates.
+
+ Returns
+ -------
+ list
+ A list of dictionaries, each containing the ids, word, and score of a polysyndeton candidate.
+ """
+ candidates = []
+ for c in self.candidates:
+ candidates.append({
+ "ids": c.ids,
+ "score": c.score,
+ "word": c.word})
+ return candidates
+

Methods

def check_add_candidate(self, candidates, candidate)
    This method checks if the candidate is long enough to be a polysyndeton candidate.

    Parameters
    ----------
    candidates : list
        A list of polysyndeton candidates.
    candidate : PolysyndetonCandidate
        The candidate to check and possibly add.

def find_candidates(self)
    This method finds polysyndeton candidates in the text.

def serialize(self) -> list
    This method serializes the polysyndeton candidates.

    Returns
    -------
    list
        A list of dictionaries, each containing the ids, word, and score of a polysyndeton candidate.

def split_in_phrases(self)
    This method splits the text into phrases.

    Returns
    -------
    list
        A list of lists, each containing the start and end index of a phrase.

class PolysyndetonCandidate(ids, word)

    This class represents a polysyndeton candidate.

    Parameters
    ----------
    ids : list
        A list of token ids that form the candidate.
    word : str
        The repeated word (conjunction) that the candidate's phrases start with.
class PolysyndetonCandidate():
+ """
+ This class represents a polysyndeton candidate.
+ """
+ def __init__(self, ids, word):
+ """
+ Constructor for the PolysyndetonCandidate class.
+
+ Parameters
+ ----------
+ ids : list
+ A list of token ids that form the candidate.
+            word : str
+                The repeated word (conjunction) that the candidate's phrases start with.
+ """
+ self.ids = ids
+ self.word = word
+
+ @property
+ def score(self):
+ """
+ This property returns the score of the polysyndeton candidate.
+ """
+ return len(self.ids)

Instance variables

prop score
    This property returns the score of the polysyndeton candidate.
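
A short usage sketch (illustrative sample sentence; same assumptions as the examples above):

from freestylo.TextObject import TextObject
from freestylo.TextPreprocessor import TextPreprocessor
from freestylo.PolysyndetonAnnotation import PolysyndetonAnnotation

text = TextObject(text="We lived and laughed and loved and left.", language="en")
TextPreprocessor(language="en").process_text(text)

polysyndeton = PolysyndetonAnnotation(text, min_length=2)
polysyndeton.find_candidates()
for candidate in polysyndeton.serialize():
    print(candidate["word"], candidate["score"], candidate["ids"])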
Module stylotool.src.freestylo.SimilarityNN

class SimilarityNN(input_dim, hidden_dim, num_hidden, output_dim, device)

    This class defines a neural network for metaphor detection.

    Parameters
    ----------
    input_dim : int
        The dimension of the input.
    hidden_dim : int
        The dimension of the hidden layers.
    num_hidden : int
        The number of hidden layers.
    output_dim : int
        The dimension of the output.
    device : str
        The device to run the model on.
class SimilarityNN(nn.Module):
+ """
+ This class defines a neural network for metaphor detection.
+ """
+ def __init__(self, input_dim, hidden_dim, num_hidden, output_dim, device):
+ """
+ Constructor for the SimilarityNN class.
+
+ Parameters
+ ----------
+ input_dim : int
+ The dimension of the input.
+ hidden_dim : int
+ The dimension of the hidden layers.
+ num_hidden : int
+ The number of hidden layers.
+ output_dim : int
+ The dimension of the output.
+ device : str
+ The device to run the model on.
+ """
+ super(SimilarityNN, self).__init__()
+ self.hidden_dim = hidden_dim
+ self.num_hidden = num_hidden
+ self.output_dim = output_dim
+
+ self.input_layer = nn.Linear(input_dim, hidden_dim, device=device)
+ self.hidden_layers = nn.ModuleList()
+ for i in range(num_hidden):
+ self.hidden_layers.append(nn.Linear(hidden_dim, hidden_dim, device=device))
+ self.output_layer = nn.Linear(hidden_dim, self.output_dim, device=device)
+
+
+ def forward(self, data):
+ """
+ This method defines the forward pass of the neural network.
+
+ Parameters
+ ----------
+ data : tensor
+ The input data.
+
+ Returns
+ -------
+ tensor
+ The output of the neural network.
+ """
+ intermediate = [nn.ReLU()(self.input_layer(data))]
+ for i in range(self.num_hidden):
+ intermediate.append(nn.ReLU()(self.hidden_layers[i](intermediate[i])))
+ out = self.output_layer(intermediate[-1])
+ return out
+

Methods

def forward(self, data)
    This method defines the forward pass of the neural network.

    Parameters
    ----------
    data : tensor
        The input data.

    Returns
    -------
    tensor
        The output of the neural network.
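
A small instantiation sketch (the dimensions match those used by MetaphorAnnotation.load_model above; the input batch is random illustrative data):

import torch
from freestylo.SimilarityNN import SimilarityNN

# 300 -> 128 -> 128, one hidden layer, on CPU
model = SimilarityNN(input_dim=300, hidden_dim=128, num_hidden=1, output_dim=128, device="cpu")
vectors = torch.rand(4, 300)   # a batch of four 300-dimensional word vectors
projected = model(vectors)
print(projected.shape)         # torch.Size([4, 128])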
Module stylotool.src.freestylo.TextObject

class TextObject(textfile=None, text=None, language='')

    This class is used to store a text and its annotations.

    Parameters
    ----------
    textfile : str, optional
        The path to a text file.
    text : str, optional
        The text as a string.
    language : str, optional
        The language of the text.

class TextObject:
+ """
+ This class is used to store a text and its annotations.
+ """
+ def __init__(self, textfile=None, text=None, language=''):
+ """
+ Constructor for the TextObject class.
+
+ Parameters
+ ----------
+ textfile : str, optional
+ The path to a text file.
+            text : str, optional
+                The text as a string.
+ language : str, optional
+ The language of the text.
+ """
+ self.textfile = textfile
+ self.language = language
+ self.tokens = []
+ self.pos = []
+ self.lemmas = []
+ self.dep = []
+ self.vectors = []
+ self.annotations = []
+ self.token_offsets = []
+ self.text = text
+
+ if textfile is not None:
+ try:
+ with open(textfile, 'r') as f:
+ self.text = f.read()
+ except FileNotFoundError:
+ print("File not found, no textfile loaded")
+ elif text is not None:
+ self.text = text
+
+ def save_as(self, filename):
+ """
+ This method saves the TextObject as a pickle file.
+
+ Parameters
+ ----------
+ filename : str
+ """
+ with open(filename, 'wb') as f:
+ pickle.dump(self, f)
+
+ def serialize(self, filename):
+ """
+ This method serializes the TextObject as a JSON file.
+
+ Parameters
+ ----------
+ filename : str
+ """
+        annotations = {}
+        for anno in self.annotations:
+            annotations[anno.type] = anno.serialize()
+        save_dict = {
+            'text': self.text,
+            'tokens': self.tokens,
+            'pos': self.pos,
+            'lemmas': self.lemmas,
+            'dep': self.dep,
+            'token_offsets': self.token_offsets,
+            'annotations': annotations
+        }
+        with open(filename, 'w') as f:
+            json.dump(save_dict, f, indent=4)
+
+
+ def has_text(self):
+ """
+ This method checks if the TextObject has a text.
+ """
+ return len(self.text) > 0
+
+ def has_tokens(self):
+ """
+ This method checks if the TextObject has tokens.
+ """
+ return len(self.tokens) > 0
+
+ def has_pos(self):
+ """
+ This method checks if the TextObject has part-of-speech tags.
+ """
+ return len(self.pos) > 0
+
+ def has_lemmas(self):
+ """
+ This method checks if the TextObject has lemmas.
+ """
+ return len(self.lemmas) > 0
+
+ def has_dep(self):
+ """
+ This method checks if the TextObject has dependency relations.
+ """
+ return len(self.dep) > 0
+
+ def has_vectors(self):
+ """
+ This method checks if the TextObject has vectors.
+ """
+ return len(self.vectors) > 0
+
+ def has_annotations(self):
+ """
+ This method checks if the TextObject has annotations.
+ """
+ return len(self.annotations) > 0
+

Methods

def has_annotations(self)
    This method checks if the TextObject has annotations.

def has_dep(self)
    This method checks if the TextObject has dependency relations.

def has_lemmas(self)
    This method checks if the TextObject has lemmas.

def has_pos(self)
    This method checks if the TextObject has part-of-speech tags.

def has_text(self)
    This method checks if the TextObject has a text.

def has_tokens(self)
    This method checks if the TextObject has tokens.

def has_vectors(self)
    This method checks if the TextObject has vectors.

def save_as(self, filename)
    This method saves the TextObject as a pickle file.

    Parameters
    ----------
    filename : str

def serialize(self, filename)
    This method serializes the TextObject as a JSON file.

    Parameters
    ----------
    filename : str
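
A short usage sketch (file names are illustrative):

from freestylo.TextObject import TextObject

# construct directly from a string (or pass textfile="poem.txt" to read from disk)
text = TextObject(text="Example text.", language="en")
if text.has_text():
    text.save_as("text.pickle")    # pickle the whole object
    text.serialize("text.json")    # write text, tokens and annotations as JSON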
Module stylotool.src.freestylo.TextPreprocessor

class TextPreprocessor(language='en')

    This class is used to preprocess text.
    It uses the TextObject class to store the text and its annotations.

    Parameters
    ----------
    language : str, optional
        The language of the text.

class TextPreprocessor:
+ """
+ This class is used to preprocess text.
+ It uses the TextObject class to store the text and its annotations.
+ """
+ def __init__(self, language='en'):
+ """
+ Constructor for the TextPreprocessor class.
+
+ Parameters
+ ----------
+ language : str, optional
+ The language of the text.
+ """
+
+ if language == 'en':
+ self.nlp = self.load_spacy_nlp('en_core_web_lg')
+ elif language == 'de':
+ self.nlp = self.load_spacy_nlp('de_core_news_lg')
+ elif language == 'mgh':
+ from MGHPreprocessor import MGHPreprocessor
+ self.nlp = MGHPreprocessor()
+
+
+ def load_spacy_nlp(self, model_name):
+ """
+ This method loads a spaCy model.
+
+ Parameters
+ ----------
+ model_name : str
+ The name of the spaCy model.
+
+ Returns
+ -------
+ spacy.lang
+ The spaCy model.
+ """
+ nlp = None
+ while nlp is None:
+ try:
+ nlp = spacy.load(model_name)
+ except:
+ try:
+ spacy.cli.download(model_name)
+ except:
+ print(f"ERROR: Could not download model {model_name}")
+ exit(1)
+ return nlp
+
+
+ def process_text(self, text : TextObject):
+ """
+ This method processes a text.
+ """
+ processed = self.nlp(text.text)
+ try:
+ text.tokens = [token.text for token in processed]
+ except:
+ print("No tokens available")
+
+ try:
+ text.pos = [token.pos_ for token in processed]
+ except:
+ print("No POS available")
+
+ try:
+ text.lemmas = [token.lemma_ for token in processed]
+ except:
+ print("No lemmas available")
+
+ try:
+ text.dep = [token.dep_ for token in processed]
+ except:
+ print("No dependencies available")
+
+ try:
+ text.vectors = [token.vector for token in processed]
+ except:
+ print("No vectors available")
+
+ try:
+ text.token_offsets = [(token.idx, token.idx + len(token.text)) for token in processed]
+ except:
+ print("No token offsets available")
+

Methods

def load_spacy_nlp(self, model_name)
    This method loads a spaCy model.

    Parameters
    ----------
    model_name : str
        The name of the spaCy model.

    Returns
    -------
    spacy.lang
        The spaCy model.

def process_text(self, text: TextObject)
    This method processes a text.
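
A short usage sketch (the spaCy model is downloaded on first use, which requires network access):

from freestylo.TextObject import TextObject
from freestylo.TextPreprocessor import TextPreprocessor

preprocessor = TextPreprocessor(language="en")
text = TextObject(text="A small example sentence.", language="en")
preprocessor.process_text(text)
print(text.tokens, text.pos)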
Module stylotool.src.freestylo.freestylo_main

Functions

def add_alliteration_annotation(text, config)
    This function adds alliteration annotations to the text.

def add_chiasmus_annotation(text, config)
    This function adds chiasmus annotations to the text.

def add_epiphora_annotation(text, config)
    This function adds epiphora annotations to the text.

def add_metaphor_annotation(text, config)
    This function adds metaphor annotations to the text.

def add_polysyndeton_annotation(text, config)
    This function adds polysyndeton annotations to the text.

def main()
    This is the main function of the freestylo tool.
    When you run the tool from the command line, this function is called.
    It reads the input text, preprocesses it, and adds the specified annotations.
    The results are then serialized to a file.
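
main() drives the command-line flow; a hedged sketch of the same read-preprocess-annotate-serialize steps from Python, using only the classes documented above (file names are illustrative):

from freestylo.TextObject import TextObject
from freestylo.TextPreprocessor import TextPreprocessor
from freestylo.ChiasmusAnnotation import ChiasmusAnnotation

# read and preprocess the input text
text = TextObject(textfile="input.txt", language="en")
TextPreprocessor(language="en").process_text(text)

# ChiasmusAnnotation registers itself on the TextObject,
# so its candidates end up in the serialized output
chiasmus = ChiasmusAnnotation(text)
chiasmus.find_candidates()
text.serialize("output.json")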
Module stylotool.src.freestylo

Sub-modules
-----------
stylotool.src.freestylo.AlliterationAnnotation
stylotool.src.freestylo.ChiasmusAnnotation
stylotool.src.freestylo.Configs
stylotool.src.freestylo.EpiphoraAnnotation
stylotool.src.freestylo.MGHPreprocessor
stylotool.src.freestylo.MetaphorAnnotation
stylotool.src.freestylo.PolysyndetonAnnotation
stylotool.src.freestylo.SimilarityNN
stylotool.src.freestylo.TextObject
stylotool.src.freestylo.TextPreprocessor
stylotool.src.freestylo.freestylo_main

Module stylotool.src

Sub-modules
-----------
stylotool.src.freestylo

Module stylotool.test

Sub-modules
-----------
stylotool.test.test_alliteration_annotation
stylotool.test.test_chiasmus_annotation
stylotool.test.test_epiphora_annotation
stylotool.test.test_metaphor_annotations
stylotool.test.test_polysyndeton_annotation
stylotool.test.test_text_object

Module stylotool.test.test_alliteration_annotation

Functions

def test_alliteration_annotation()

Module stylotool.test.test_chiasmus_annotation

Functions

def test_chiasmus_annotation()

Module stylotool.test.test_epiphora_annotation

Functions

def test_epiphora_annotation()

Module stylotool.test.test_metaphor_annotations

Functions

def test_metaphor_annotation()

Module stylotool.test.test_polysyndeton_annotation

Functions

def test_polysyndeton_annotation()

Module stylotool.test.test_text_object

Functions

def test_processing()