from __future__ import division
from nltk import data
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer
from collections import Counter
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import math
import numpy as np
import json
# set nltk_data path so data can be found on Heroku
data.path.append('./nltk_data/')

class Document:
    def __init__(self, link, title, description, processed):
        self.link = link
        self.title = title
        self.description = description
        self.processed = processed

class TextProcessor(object):
    def __init__(self):
        self.stop_words = stopwords.words('english')
        self.porter = PorterStemmer()
        self.vectorizer = TfidfVectorizer()
        self.doc_count = 0
        self.inverse_list = defaultdict(int)  # inverse list of words to number of documents
        self.doc_collection = []
        self.word_list = []
        self.doc_mat = None  # document matrix

    def process_doc(self, doc):
        self.doc_count += 1
        processed = []
        text = doc.translate(None, string.punctuation)
        words = word_tokenize(text)
        for w in words:
            w = w.lower()
            if w not in self.stop_words:
                stemmed = self.porter.stem(w)
                processed.append(stemmed)
        for w in set(processed):
            self.inverse_list[w] += 1
        return processed
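    # A rough illustration (hypothetical input; the exact output depends on the Porter stemmer
    # and the NLTK English stopword list): process_doc("Running the tests quickly") lower-cases
    # the tokens, drops stop words like "the", and stems the rest, returning something like
    # ['run', 'test', 'quickli']; each distinct stem also bumps its count in inverse_list.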

    def gen_matrix(self):
        self.word_list = self.inverse_list.keys()
        word_counts = [dict(Counter(doc.processed)) for doc in self.doc_collection]
        idf_weights = self._gen_idf_weights(self.word_list)
        # prepare the document matrix
        mat = np.zeros((self.doc_count, len(self.inverse_list)))
        for index, wc in enumerate(word_counts):
            for w, c in wc.items():
                mat[index, self.word_list.index(w)] = c
        # apply the idf weights to the column of the matrix corresponding to each term
        for i in range(mat.shape[1]):
            mat[:, i] *= idf_weights[i]
        self.doc_mat = mat

    def _gen_idf_weights(self, wlist):
        # we could apply add-1 smoothing here to avoid division by zero
        return map(lambda w: math.log(self.doc_count / self.inverse_list[w]), wlist)
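    # Worked example (hypothetical numbers): with doc_count = 4, a term that appears in
    # 2 documents gets an idf weight of log(4 / 2) ~= 0.693, while a term appearing in all
    # 4 documents gets log(4 / 4) = 0 and is effectively ignored by the weighting.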

    def cosine_similarity(self, d1, d2):
        # the denominator is equivalent to (math.sqrt(np.dot(d1, d1)) * math.sqrt(np.dot(d2, d2)))
        # np.linalg.norm calculates the L2 norm by default
        return np.dot(d1, d2) / (np.linalg.norm(d1) * np.linalg.norm(d2))
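    # Worked example (hypothetical vectors): for d1 = [1, 0, 1] and d2 = [1, 1, 0],
    # np.dot(d1, d2) = 1 and both norms are sqrt(2), so the similarity is
    # 1 / (sqrt(2) * sqrt(2)) = 0.5; identical vectors score 1.0, orthogonal ones 0.0.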
"""
using scikit-learn library #
step 1: build the document matrix with tf-idf weights for the collection
step 2: compute similarity between each document against all other documents, resulting in an n x n matrix
step 3: extract the most similar documents
"""

    def build_doc_matrix(self):
        # transforms the corpus into its tf-idf representation,
        # returning a sparse matrix
        text = [' '.join(doc.processed) for doc in self.doc_collection]
        self.doc_mat = self.vectorizer.fit_transform(text)
        self.word_list = self.vectorizer.get_feature_names()

    def compute_similarity_sklearn(self):
        self.build_doc_matrix()
        # .T gives the transpose and .A converts from sparse to a dense array
        # note: no need to normalize, since TfidfVectorizer returns L2-normalized tf-idf rows
        mat = self.doc_mat
        return (mat * mat.T).A
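    # Because each tf-idf row is already unit length, the dot product of two rows equals their
    # cosine similarity, so mat * mat.T is the full n x n similarity matrix with ones on the
    # diagonal (e.g. 3 documents -> a 3 x 3 matrix).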

    def similarity_analysis(self):
        sim_mat = self.compute_similarity_sklearn()
        # zero out the diagonal, which is all ones - each document compared with itself is an exact match
        # TODO: maybe move this to a utility module
        np.fill_diagonal(sim_mat, 0)
        # get the index of the largest value in each row
        most_similar_indices = sim_mat.argmax(1)
        # return pairs of similar documents
        return enumerate(most_similar_indices)

    def get_top_ind(self, vect, num=10):
        """
        find the indices of the top n items in the given vector
        """
        top = np.argpartition(vect, -num)[-num:]
        return top[np.argsort(vect[top])][::-1]
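    # Example (hypothetical input): for vect = np.array([0.1, 0.9, 0.4, 0.7]) and num=2,
    # np.argpartition moves the two largest scores to the tail, and the argsort/[::-1] step
    # orders them descending, so the call returns array([1, 3]) (scores 0.9 and 0.7).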

    def map_data(self, dataset):
        """
        iterate over the dataset and map each document into the document collection
        """
        for doc in dataset:
            title, desc = doc['title'], doc['description']
            text = title + ' ' + desc  # combine the title and the post content
            doc = Document(doc['link'], title, desc, self.process_doc(text.encode('utf-8')))
            self.doc_collection.append(doc)
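

# A minimal usage sketch, not part of the original module: the feed entries below are made up,
# and it assumes the NLTK corpora used above (punkt, stopwords) are available on the data path.
# A real caller would pass in parsed feed items with the same 'link'/'title'/'description' keys.
if __name__ == '__main__':
    sample_dataset = [
        {'link': 'http://example.com/a', 'title': u'Intro to NLP',
         'description': u'Tokenizing and stemming text with NLTK.'},
        {'link': 'http://example.com/b', 'title': u'NLP basics',
         'description': u'Stemming and tokenizing text using NLTK.'},
        {'link': 'http://example.com/c', 'title': u'Cooking pasta',
         'description': u'A simple weeknight pasta recipe.'},
    ]
    processor = TextProcessor()
    processor.map_data(sample_dataset)
    # each pair is (document index, index of its most similar document)
    for i, most_similar in processor.similarity_analysis():
        print('%s -> %s' % (processor.doc_collection[i].title,
                            processor.doc_collection[most_similar].title))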