forked from srbhr/Resume-Matcher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Distill.py
57 lines (43 loc) · 1.61 KB
/
Distill.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
# English stopword list from NLTK (requires the NLTK 'stopwords' corpus to be downloaded).
stop_words = stopwords.words('english')
# Load the spaCy pipeline and create an nlp object.
# This requires the spaCy 'en_core_web_sm' model to be installed on the system.
nlp = spacy.load('en_core_web_sm')
# Process to remove stopwords from tokenized text; takes an optional word list
# for words that are not in the stop words but that the user wants removed anyway.
def remove_stopwords(text, stopwords=stop_words, optional_params=False, optional_words=[]):
    """
    Remove stopwords from a sequence of word tokens.

    Args:
        text: iterable of word tokens to filter.
        stopwords: base stopword list (defaults to NLTK's English stopwords).
        optional_params: when True, also remove the words in ``optional_words``.
        optional_words: extra words to delete in addition to ``stopwords``.

    Returns:
        list: the tokens of ``text`` that are not stopwords.
    """
    # Build a local set instead of appending to the caller's list: the old code
    # mutated the shared module-level default, leaking state across calls, and
    # it appended ``optional_words`` as a single nested list, so those words
    # never actually matched any token.  A set also makes membership O(1).
    banned = set(stopwords)
    if optional_params:
        banned.update(optional_words)
    return [word for word in text if word not in banned]
def tokenize(text):
    """Strip punctuation from *text* and split it into word tokens."""
    # Keep only word characters and whitespace before handing off to NLTK.
    cleaned = re.sub(r'[^\w\s]', '', text)
    return word_tokenize(cleaned)
def lemmatize(text):
    """Lemmatize a list of word tokens.

    The tokens are joined into one string, run through the spaCy pipeline,
    and the lemma of every resulting token is returned as a list.
    """
    doc = nlp(" ".join(text))
    return [token.lemma_ for token in doc]
# internal fuction, useless right now.
def _to_string(List):
# the input parameter must be a list
string = " "
return string.join(List)
def remove_tags(text, postags=['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Keep only the words whose Part of Speech (POS) tag is allowed by the user
    (``postags``) and eliminate the rest.
    """
    doc = nlp(" ".join(text))
    return [token.text for token in doc if token.pos_ in postags]