import pickle
import heapq
from collections import defaultdict

import numpy as np
import pandas as pd  # the pickled index files are pandas DataFrames
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
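# nltk.download fetches the 'punkt' tokenizer models and the English stopword
# list on first run; once the data is cached the calls are effectively no-ops.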
def load(doc):
    '''
    Load and return a pickled object from the file at path `doc`.
    '''
    with open(doc, 'rb') as file:
        return pickle.load(file)
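# Caution: pickle.load can execute arbitrary code from untrusted input; the
# .obj files read here are assumed to be generated locally by the indexing step.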
def query_processing(query):
    '''
    Rank documents against a free-text query.
    Data structures used include lists, heaps, dictionaries, and DataFrames.
    The query is preprocessed the same way as the documents: lower-casing,
    punctuation and stopword removal, tokenization, and stemming.
    Term frequencies are computed for the query and converted to tf-idf
    weights; the cosine similarity between each document and the query is
    then ranked with a heap.
    Takes the query string and returns the 10 highest-ranked documents as
    (score, doc_id) pairs.
    '''
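    # Illustrative walk-through of the preprocessing below:
    # "The dog's bark!" -> "the dog  bark " -> tokens ['the', 'dog', 'bark']
    # -> stopwords removed ['dog', 'bark'] -> stems ['dog', 'bark']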
    no_of_doc = 34886 + 1  # actual number of documents + 1
    # Lower-case everything
    query = query.lower()
    # Remove line breaks and possessive 's
    query = query.replace("\n", " ").replace("\r", " ")
    query = query.replace("'s", " ")
    # Replace punctuation with spaces
    punctuation_list = '!"#$%&()*+,-./:;<=>?@[\\]^_{|}~'
    table = str.maketrans(dict.fromkeys(punctuation_list, " "))
    query = query.translate(table)
    # Tokenize; the query length is measured before stopword removal
    tokens = word_tokenize(query)
    query_length = len(tokens)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    # Term frequency of each stemmed token in the query
    query_freq = defaultdict(int)
    for token in tokens:
        query_freq[token] += 1
    # Helper computes the query's tf-idf against the document bodies and the
    # titles as two separate corpora
    q_ls, q_tf_idf = helper("inverted_index.obj", "processed_data.obj",
                            "Length", no_of_doc, query_length, query_freq)
    q_ls_title, q_tf_idf_title = helper("inverted_index_title.obj",
                                        "processed_data_title.obj",
                                        "TitleLength", no_of_doc,
                                        query_length, query_freq)
    # Cosine similarity between the query vector and every document vector
    tf_idf = load("tf-idf.obj")
    tf_idf_title = load("tf-idf_title.obj")
    q_ls = np.array(q_ls + q_ls_title)
    norm_query = np.sqrt(np.sum(q_ls * q_ls))  # magnitude of the query vector
    rank_heap = []
    epsilon = 1e-9  # guards against division by zero
    for doc in range(no_of_doc - 1):
        val = 0  # dot product of query and document weights
        d_ls = []  # tf-idf values for this document
        for term, value in tf_idf[doc].items():
            if term in q_tf_idf:
                val += value * q_tf_idf[term]
            d_ls.append(value)
        for term, value in tf_idf_title[doc].items():
            if term in q_tf_idf_title:
                val += value * q_tf_idf_title[term]
            d_ls.append(value)
        d_ls = np.array(d_ls)
        norm_doc = np.sqrt(np.sum(d_ls * d_ls))  # magnitude of the document vector
        cosine_sim = val / (norm_doc * norm_query + epsilon)
        heapq.heappush(rank_heap, (cosine_sim, doc))
    # Ten highest-scoring documents, best first
    req_doc = heapq.nlargest(10, rank_heap)
    return req_doc
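# A minimal sketch (not part of the original pipeline) showing the cosine
# similarity computed above on toy vectors; the numbers are illustrative only.
def _cosine_similarity_demo():
    q = np.array([1.0, 2.0, 0.0])  # toy query tf-idf vector
    d = np.array([2.0, 1.0, 1.0])  # toy document tf-idf vector
    epsilon = 1e-9  # same division-by-zero guard as in query_processing
    # (1*2 + 2*1 + 0*1) / (sqrt(5) * sqrt(6)) ~= 4 / 5.477 ~= 0.730
    return float(np.dot(q, d) / (np.linalg.norm(q) * np.linalg.norm(d) + epsilon))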
def helper(inverted_index, processed_data, length, no_of_doc, query_length, query_freq):
    '''
    Compute tf-idf weights (BM25 weighting) for the query terms against one
    corpus (document bodies or titles).
    Data structures used include lists, dictionaries, and DataFrames.
    Takes the pickled inverted-index and corpus file names, the name of the
    length column, and the query statistics; returns the list of weights for
    all query terms plus a dict of weights for the terms that also occur in
    the corpus vocabulary.
    '''
    # tf-idf for the query
    ii_df = load(inverted_index).to_dict()['PostingList']  # term -> document frequency
    k = 1.75  # BM25 parameter
    b = 0.75  # BM25 parameter
    tdf = load(processed_data)
    avg_length = tdf[length].mean()  # average document length
    # Fold the query in as one extra pseudo-document
    avg_length = ((no_of_doc - 1) * avg_length + query_length) / no_of_doc
    q_tf_idf = {}  # weights only for terms shared with the corpus vocabulary
    q_ls = []  # weights for every query term
    for key, value in query_freq.items():
        tf = value / query_length
        if key in ii_df:
            idf = np.log(no_of_doc / ii_df[key])
            q_tf_idf[key] = idf * (tf * (k + 1)) / (tf + k * (1 - b + b * (query_length / avg_length))) * 100
        # Every term contributes to q_ls, with log(no_of_doc) as a fallback idf
        q_ls += [np.log(no_of_doc) * (tf * (k + 1)) / (tf + k * (1 - b + b * (query_length / avg_length))) * 100]
    return q_ls, q_tf_idf
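# Worked example of the weight computed in helper(), with illustrative numbers
# (tf = 0.5, document frequency = 10, no_of_doc = 34887, query_length = 4,
# avg_length = 120; all made up for the example):
#   idf    = ln(34887 / 10)                                  ~= 8.157
#   tf_bm  = (0.5 * 2.75) / (0.5 + 1.75 * (1 - 0.75 + 0.75 * 4 / 120))
#          = 1.375 / 0.98125                                 ~= 1.401
#   weight = idf * tf_bm * 100                               ~= 1143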
if __name__ == "__main__":
    # Example query from the original script
    print(query_processing("bar appear enjoy face inside group work"))