Skip to content

Commit 1b9c88a

Browse files
Source code made public
1 parent cb32bad commit 1b9c88a

10 files changed

+3041
-0
lines changed
Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
/*
2+
* To change this license header, choose License Headers in Project Properties.
3+
* To change this template file, choose Tools | Templates
4+
* and open the template in the editor.
5+
*/
6+
package irproject;
7+
8+
import java.io.BufferedReader;
9+
import java.io.File;
10+
import java.io.FileInputStream;
11+
import java.io.FileNotFoundException;
12+
import java.io.FileReader;
13+
import java.io.FileWriter;
14+
import java.io.IOException;
15+
import java.io.ObjectInputStream;
16+
import java.util.ArrayList;
17+
import java.util.Collections;
18+
import java.util.HashMap;
19+
import java.util.Iterator;
20+
import java.util.LinkedHashMap;
21+
import java.util.Map;
22+
import java.util.Scanner;
23+
import java.util.stream.Collectors;
24+
25+
/**
 * BM25 retrieval with pseudo-relevance feedback (query expansion).
 *
 * <p>For every test query the engine runs a first BM25 pass, takes the most
 * frequent non-stopword terms of the top-ranked documents as expansion terms,
 * re-runs BM25 with the expanded query and writes the top 100 results to a
 * per-query file.
 *
 * <p>All state is held in public static fields, so the class is not
 * thread-safe.
 *
 * @author manth
 */
public class BM25PseudoRelSearchEngine {

    /** BM25 term-frequency saturation parameter. */
    private static final double K1 = 1.2;
    /** BM25 document-length normalisation parameter. */
    private static final double B = 0.75;
    /** Maximum number of result lines emitted per query. */
    private static final int MAX_RESULTS = 100;
    /**
     * Number of top-ranked documents used for feedback. The original loop
     * broke on {@code count > 10}, i.e. it consumed 11 documents; that
     * behaviour is kept.
     */
    private static final int FEEDBACK_DOCS = 11;
    /** Maximum number of expansion terms taken from a single document. */
    private static final int MAX_TERMS_PER_DOC = 100;
    /** Length of the null-terminated array returned by pseudoRelevanceFeedback. */
    private static final int FEEDBACK_ARRAY_SIZE = 1000;
    /** Minimum length of the null-terminated stopword array. */
    private static final int MIN_STOPWORD_CAPACITY = 450;

    /** Null-terminated list of stopwords (entries after the last word are null). */
    public static String[] stopwords = new String[450];
    /** Inverted index: term -> (document id -> term frequency). */
    public static HashMap<String, HashMap> index = new HashMap<>();
    /** Document id -> document length. */
    public static HashMap<String, Long> doc_length = new HashMap<>();
    /** Average document length over {@link #doc_length}. */
    public static double avg_dl;
    /** Contains document_id and the (accumulated) score. */
    public static HashMap<String, Double> rankings = new HashMap<String, Double>();
    /** "Q" + query id -> ids of documents judged relevant for that query. */
    public static HashMap<String, ArrayList> relevance = new HashMap<>();
    /** Number of documents in the collection. */
    public static int N;
    /** Query id -> raw query text. */
    public static HashMap<String, String> test_queries;

    /**
     * Loads all inputs, then for each test query performs BM25 retrieval with
     * pseudo-relevance feedback and writes the top-100 result file.
     *
     * @throws IOException if a result file cannot be written
     * @throws Exception   if the stopword file cannot be read
     */
    public static void run() throws IOException, Exception {
        fillStopwords();
        get_inverted_index();
        set_avg_dl();
        test_queries = get_test_queries();
        get_relevant_documents("inoutforBM25pseudRel\\cacm.txt");
        N = doc_length.size();

        for (Map.Entry<String, String> entry : test_queries.entrySet()) {
            String queryId = "Q" + entry.getKey();
            String queryText = removePunctuations(entry.getValue()).toLowerCase().trim();

            // First pass: rank with the original query only.
            rankings = new HashMap<>();
            Map<String, Double> firstPass = compute_BM25_score_for(queryText, queryId);

            String expandedQuery = expandQuery(queryText, firstPass);

            // Second pass: start from a clean score table, otherwise the
            // first-pass scores would be added to the expanded-query scores.
            rankings = new HashMap<>();
            Map<String, Double> finalResult = compute_BM25_score_for(expandedQuery, queryId);

            File resultFile = new File(
                    "CACMResultsForBM25+PseudoRel\\Top100\\Q" + entry.getKey() + "-top100.txt");
            File parent = resultFile.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            try (FileWriter writer = new FileWriter(resultFile)) {
                writer.write(to_TREC_format(finalResult, queryId));
            }
        }
    }

    /**
     * Builds the expanded query: the feedback terms of the top-ranked
     * documents followed by the original query, all separated by single spaces.
     */
    private static String expandQuery(String queryText, Map<String, Double> rankedDocs) {
        ArrayList<String> expansion = new ArrayList<>();
        int considered = 0;
        for (String docId : rankedDocs.keySet()) {
            if (considered >= FEEDBACK_DOCS) {
                break;
            }
            String[] candidates =
                    pseudoRelevanceFeedback(docId, expansion.toArray(new String[0]));
            for (String candidate : candidates) {
                if (candidate == null) {
                    break; // result array is null-terminated
                }
                expansion.add(candidate);
            }
            considered++;
        }

        StringBuilder expanded = new StringBuilder();
        for (String term : expansion) {
            expanded.append(term).append(' ');
        }
        return expanded.append(queryText).toString();
    }

    /**
     * Adds the BM25 score of every space-separated query term to
     * {@link #rankings} and returns the documents sorted by descending score.
     *
     * <p>Per term the contribution to a document is
     * {@code log(((r+0.5)/(R-r+0.5)) / ((n-r+0.5)/(N-n-R+r+0.5))) * ((k1+1)f)/(K+f)}.
     * When no relevance judgments exist for {@code q_id}, R and r are 0.
     * Scores are accumulated into the existing content of {@link #rankings};
     * callers wanting a fresh ranking must reset that map first.
     *
     * @param query space-separated query terms
     * @param q_id  key into {@link #relevance} (e.g. {@code "Q12"})
     * @return a new map of document id to score, in descending score order
     * @throws IOException declared for compatibility with existing callers
     */
    public static Map<String, Double> compute_BM25_score_for(String query, String q_id)
            throws IOException {
        ArrayList<?> relevantDocs = relevance.get(q_id);
        if (relevantDocs == null) {
            // No judgments for this query: rank without relevance information.
            relevantDocs = new ArrayList<String>();
        }
        double R = relevantDocs.size();

        for (String term : query.split(" ")) {
            if (term.isEmpty()) {
                continue;
            }
            Map<?, ?> postings = index.get(term);
            if (postings == null) {
                continue; // term does not occur in the collection
            }

            double ni = postings.size();
            double ri = 0.0;
            for (Object relevantDoc : relevantDocs) {
                if (postings.containsKey(relevantDoc)) {
                    ri += 1.0;
                }
            }

            double ratio = ((ri + 0.5) / (R - ri + 0.5))
                    / ((ni - ri + 0.5) / (N - ni - R + ri + 0.5));
            if (!(ratio > 0)) {
                continue; // weight undefined (inconsistent counts); skip the term
            }
            double termWeight = Math.log(ratio);

            for (Map.Entry<?, ?> posting : postings.entrySet()) {
                String docId = (String) posting.getKey();
                double fi = ((Number) posting.getValue()).doubleValue();
                double K = compute_k(doc_length.get(docId));
                double score = termWeight * ((K1 + 1) * fi) / (K + fi);
                rankings.merge(docId, score, Double::sum);
            }
        }
        return sortByValue(rankings);
    }

    /**
     * Renders up to 100 ranked documents, one per line, as
     * {@code " <query_id> <doc_id> <rank> <score> GOOGLE_SERVER_#234"}.
     *
     * <p>NOTE(review): canonical TREC run format has a literal {@code Q0}
     * column after the query id and no leading space. The existing layout is
     * kept unchanged because other code may parse these files positionally.
     *
     * @param t    documents in the order they should be ranked
     * @param Q_ID query identifier written on every line
     * @return the formatted lines, each terminated by {@code '\n'}
     */
    public static String to_TREC_format(Map<String, Double> t, String Q_ID) {
        StringBuilder out = new StringBuilder();
        int rank = 1;
        for (Map.Entry<String, Double> entry : t.entrySet()) {
            out.append(' ').append(Q_ID)
               .append(' ').append(entry.getKey())
               .append(' ').append(rank++)
               .append(' ').append(entry.getValue())
               .append(" GOOGLE_SERVER_#234\n");
            if (rank > MAX_RESULTS) {
                break;
            }
        }
        return out.toString();
    }

    /**
     * Returns a new insertion-ordered map holding the entries of {@code map}
     * sorted by value in descending order. The input map is not modified.
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map) {
        return map.entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Collections.reverseOrder()))
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new));
    }

    /**
     * Sets {@link #avg_dl} to the mean of the values in {@link #doc_length},
     * or to 0 when no document lengths are loaded.
     */
    public static void set_avg_dl() {
        double total = 0.0;
        for (Long length : doc_length.values()) {
            total += length;
        }
        avg_dl = doc_length.isEmpty() ? 0.0 : total / doc_length.size();
    }

    /**
     * Computes the BM25 length normalisation {@code K = k1*((1-b) + b*dl/avdl)}
     * with k1 = 1.2 and b = 0.75.
     *
     * @param dl document length; when null, or when {@link #avg_dl} is not
     *           positive, the document is treated as having average length
     * @return the value of K
     */
    public static double compute_k(Long dl) {
        double lengthRatio = (dl == null || !(avg_dl > 0)) ? 1.0 : dl / avg_dl;
        return K1 * ((1 - B) + B * lengthRatio);
    }

    /**
     * Loads {@link #index} and {@link #doc_length} from their serialized
     * files. On failure a message is printed and whatever was loaded so far
     * is kept.
     */
    @SuppressWarnings("unchecked")
    public static void get_inverted_index() {
        try {
            index = (HashMap) readSerialized("inoutforBM25pseudRel\\index.ser");
            doc_length = (HashMap) readSerialized("inoutforBM25pseudRel\\doc_len.ser");
        } catch (IOException | ClassNotFoundException e) {
            System.out.println(" FILES NOT PROPERLY IMPORTED");
            System.err.println(e);
        }
    }

    /**
     * Reads one Java-serialized object from {@code path}, closing the streams.
     * Only use on trusted files: native deserialization of untrusted data is unsafe.
     */
    private static Object readSerialized(String path)
            throws IOException, ClassNotFoundException {
        try (FileInputStream fileIn = new FileInputStream(path);
             ObjectInputStream objectIn = new ObjectInputStream(fileIn)) {
            return objectIn.readObject();
        }
    }

    /**
     * Reads relevance judgments into {@link #relevance}. Each line is split on
     * whitespace; field 0 is the query id and field 2 the document id. Lines
     * with fewer than three fields are ignored.
     *
     * @param PATH path of the relevance file
     */
    @SuppressWarnings("unchecked")
    public static void get_relevant_documents(String PATH) {
        try (Scanner lines = new Scanner(new File(PATH))) {
            while (lines.hasNextLine()) {
                String[] fields = lines.nextLine().trim().split("\\s+");
                if (fields.length < 3) {
                    continue; // blank or malformed line
                }
                relevance.computeIfAbsent("Q" + fields[0], key -> new ArrayList<String>())
                        .add(fields[2]);
            }
        } catch (FileNotFoundException ex) {
            System.out.println("Please enter a valid path ....");
        }
    }

    /**
     * Loads the serialized test queries (query id -> query text).
     *
     * @return the loaded map, or an empty map when the file cannot be read
     */
    @SuppressWarnings("unchecked")
    public static HashMap<String, String> get_test_queries() {
        try {
            return (HashMap<String, String>)
                    readSerialized("inoutforBM25pseudRel\\query-test.ser");
        } catch (IOException i) {
            i.printStackTrace();
        } catch (ClassNotFoundException c) {
            System.out.println("query-test not found");
            c.printStackTrace();
        }
        return new HashMap<>();
    }

    /**
     * Strips punctuation from {@code s}.
     *
     * <p>A character immediately followed by a digit is kept as-is, except
     * that brackets and parentheses become a space. Any other character is
     * kept only if it is a letter, digit, space or hyphen.
     *
     * @param s input text; null is treated as empty
     * @return the filtered text
     */
    public static String removePunctuations(String s) {
        if (s == null || s.isEmpty()) {
            return "";
        }
        int lastIndex = s.length() - 1;
        StringBuilder kept = new StringBuilder(s.length());
        for (int i = 0; i <= lastIndex; i++) {
            char current = s.charAt(i);
            boolean digitFollows = i < lastIndex && Character.isDigit(s.charAt(i + 1));
            if (digitFollows) {
                kept.append(isBracket(current) ? ' ' : current);
            } else if (isPlainTextChar(current)) {
                kept.append(current);
            }
        }
        return kept.toString();
    }

    /** Returns true for '[', ']', '(' and ')'. */
    private static boolean isBracket(char c) {
        return c == '[' || c == ']' || c == '(' || c == ')';
    }

    /** Returns true for letters, digits, the space and the hyphen. */
    private static boolean isPlainTextChar(char c) {
        return Character.isLetterOrDigit(c) || c == ' ' || c == '-';
    }

    /**
     * Loads the whitespace-separated stopwords into {@link #stopwords}. The
     * array is sized so that at least one trailing null always terminates it.
     *
     * @throws Exception if the stopword file cannot be read
     */
    public static void fillStopwords() throws Exception {
        ArrayList<String> words = new ArrayList<>();
        try (FileReader reader = new FileReader("inoutforBM25pseudRel\\common_words.txt");
             Scanner scanner = new Scanner(new BufferedReader(reader))) {
            while (scanner.hasNext()) {
                words.add(scanner.next());
            }
        }
        int capacity = Math.max(MIN_STOPWORD_CAPACITY, words.size() + 1);
        stopwords = words.toArray(new String[capacity]);
    }

    /**
     * Selects expansion terms from one document: its terms in descending
     * frequency order, skipping stopwords and terms already present in
     * {@code queryExpansion}, up to 100 terms.
     *
     * @param doc            document id
     * @param queryExpansion terms selected so far; scanned up to its first
     *                       null entry or its end, whichever comes first
     * @return a null-terminated array of length 1000 holding the new terms
     */
    public static String[] pseudoRelevanceFeedback(String doc, String[] queryExpansion) {
        HashMap<String, Long> termFrequency = new HashMap<String, Long>();
        for (Map.Entry<String, HashMap> entry : index.entrySet()) {
            HashMap postings = entry.getValue();
            Object frequency = postings == null ? null : postings.get(doc);
            if (frequency != null) {
                termFrequency.put(entry.getKey(), ((Number) frequency).longValue());
            }
        }

        String[] selected = new String[FEEDBACK_ARRAY_SIZE];
        int count = 0;
        for (String term : sortByValue(termFrequency).keySet()) {
            if (count >= MAX_TERMS_PER_DOC) {
                break;
            }
            if (!containsTerm(queryExpansion, term) && !containsTerm(stopwords, term)) {
                selected[count++] = term;
            }
        }
        return selected;
    }

    /** Returns true if {@code term} occurs in {@code terms} before its first null entry. */
    private static boolean containsTerm(String[] terms, String term) {
        if (terms == null) {
            return false;
        }
        for (String candidate : terms) {
            if (candidate == null) {
                break;
            }
            if (candidate.equals(term)) {
                return true;
            }
        }
        return false;
    }
}

0 commit comments

Comments
 (0)