-
Notifications
You must be signed in to change notification settings - Fork 0
/
JaccardCoefficientBasedDocumentRetrieval.py
58 lines (41 loc) · 1.38 KB
/
JaccardCoefficientBasedDocumentRetrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
def _content_terms(text, tokenizer, stop_words):
    """Return the distinct lower-cased tokens of *text* that are not stop words."""
    return set(tokenizer(text.lower())) - stop_words


def _jaccard_score(first, second):
    """Return |first & second| / |first | second|, or 0.0 when both sets are empty."""
    union = first | second
    if not union:
        # Two empty term sets share nothing; avoid dividing by zero.
        return 0.0
    return len(first & second) / len(union)


def jaccard(
    path="/home/gaurav/Desktop/IIITD/IR/Assignments/assignment2/mod_stories/",
    query=None,
    k=None,
    tokenizer=None,
    stop_words=None,
):
    """Rank the files in a directory by Jaccard similarity to a query.

    Each file and the query are reduced to a set of distinct lower-cased
    tokens with stop words removed.  A file's score is the size of the
    intersection of its term set with the query's, divided by the size of
    their union.

    Called with no arguments the function is interactive: it prompts for
    the query on stdin, prints a running count of processed files and the
    score table, then prompts for ``k`` and prints the ``k`` best file names.

    Args:
        path: Directory whose regular files are the documents.  Files are
            decoded as ISO-8859-1; sub-directories are skipped.
        query: Query text.  When ``None`` it is read from stdin.
        k: Number of file names to report.  When ``None`` it is read from
            stdin.  Values below zero are treated as zero.
        tokenizer: Callable mapping a string to an iterable of tokens.
            Defaults to NLTK's ``word_tokenize``.
        stop_words: Iterable of words to ignore (compared lower-cased).
            Defaults to NLTK's English stop-word list.

    Returns:
        The ``k`` highest-scoring file names, best first.  Ties are broken
        by file name so the order does not depend on directory listing order.

    Raises:
        ValueError: If the value typed for ``k`` is not an integer.
        OSError: If ``path`` cannot be listed or a file cannot be read.
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the stop-word set once; it is the same for every document.
    stop_words = frozenset(word.lower() for word in stop_words)

    if query is None:
        print("----Enter Query--------")
        query = input()
    # Set difference drops stop words without mutating a collection while
    # iterating over it, which previously let some stop words slip through.
    query_terms = _content_terms(query, tokenizer, stop_words)

    scores = {}
    processed = 0
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, encoding='ISO-8859-1') as handle:
            doc_terms = _content_terms(handle.read(), tokenizer, stop_words)
        processed += 1
        print(processed)
        scores[name] = _jaccard_score(query_terms, doc_terms)
    print(scores)

    # Highest score first; equal scores fall back to alphabetical order.
    ranked = sorted(scores, key=lambda name: (-scores[name], name))

    if k is None:
        print("Give value of k")
        k = int(input())
    top_k = ranked[:max(k, 0)]
    print(top_k)
    return top_k
# ---------------------------------------
# Run the interactive retrieval only when executed as a script, so that
# importing this module does not block on stdin.
if __name__ == "__main__":
    jaccard()