|
| 1 | +/* |
| 2 | + * To change this license header, choose License Headers in Project Properties. |
| 3 | + * To change this template file, choose Tools | Templates |
| 4 | + * and open the template in the editor. |
| 5 | + */ |
| 6 | +package irproject; |
| 7 | + |
| 8 | +import java.io.BufferedReader; |
| 9 | +import java.io.File; |
| 10 | +import java.io.FileInputStream; |
| 11 | +import java.io.FileNotFoundException; |
| 12 | +import java.io.FileReader; |
| 13 | +import java.io.FileWriter; |
| 14 | +import java.io.IOException; |
| 15 | +import java.io.ObjectInputStream; |
| 16 | +import java.util.ArrayList; |
| 17 | +import java.util.Collections; |
| 18 | +import java.util.HashMap; |
| 19 | +import java.util.Iterator; |
| 20 | +import java.util.LinkedHashMap; |
| 21 | +import java.util.Map; |
| 22 | +import java.util.Scanner; |
| 23 | +import java.util.stream.Collectors; |
| 24 | + |
/**
 * BM25 retrieval with one round of pseudo-relevance feedback.
 *
 * <p>For every test query the engine ranks the collection with BM25, takes the
 * most frequent non-stopword terms of the top-ranked documents, appends them to
 * the query, ranks again and writes the best results to a per-query file.
 *
 * <p>All state is held in public static fields; the class is not thread-safe.
 *
 * @author manth
 */
public class BM25PseudoRelSearchEngine {
    /** Directory holding the serialized index, queries, stopwords and judgements. */
    private static final String DATA_DIR = "inoutforBM25pseudRel";
    /** Root directory of the result files. */
    private static final String OUTPUT_DIR = "CACMResultsForBM25+PseudoRel";
    /** Minimum size of {@link #stopwords}; matches the historical fixed capacity. */
    private static final int STOPWORD_CAPACITY = 450;
    /** Number of top-ranked documents mined for expansion terms. */
    private static final int FEEDBACK_DOC_COUNT = 10;
    /** Maximum number of expansion terms taken from one document. */
    private static final int FEEDBACK_TERMS_PER_DOC = 100;
    /** Length of the array returned by {@link #pseudoRelevanceFeedback}. */
    private static final int FEEDBACK_ARRAY_SIZE = 1000;
    /** Maximum number of result lines written per query. */
    private static final int MAX_RESULTS = 100;
    /** BM25 term-frequency saturation parameter. */
    private static final double K1 = 1.2;
    /** BM25 document-length normalisation parameter. */
    private static final double B = 0.75;

    /** Stopword list; entries are packed at the front and followed by {@code null}. */
    public static String[] stopwords = new String[STOPWORD_CAPACITY];
    /** Inverted index: term -> (document id -> term frequency). */
    public static HashMap<String, HashMap> index = new HashMap<>();
    /** Document id -> document length. */
    public static HashMap<String, Long> doc_length = new HashMap<>();
    /** Average document length, set by {@link #set_avg_dl()}. */
    public static double avg_dl;
    /** Document id -> score of the most recent {@link #compute_BM25_score_for} call. */
    public static HashMap<String, Double> rankings = new HashMap<String, Double>();
    /** Query id (prefixed with "Q") -> ids of the documents judged relevant. */
    public static HashMap<String, ArrayList> relevance = new HashMap<>();
    /** Number of documents in the collection. */
    public static int N;
    /** Query id -> raw query text. */
    public static HashMap<String, String> test_queries;

    /**
     * Loads all inputs, runs BM25 plus pseudo-relevance feedback for every test
     * query and writes one result file per query.
     *
     * @throws IOException if the output directory or a result file cannot be written
     * @throws Exception if the stopword file cannot be read
     */
    public static void run() throws IOException, Exception {
        fillStopwords();
        get_inverted_index();
        set_avg_dl();
        test_queries = get_test_queries();
        get_relevant_documents(new File(DATA_DIR, "cacm.txt").getPath());
        N = doc_length.size();

        File outputDir = new File(OUTPUT_DIR, "Top100");
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Cannot create output directory " + outputDir);
        }

        for (Map.Entry<String, String> query : test_queries.entrySet()) {
            String queryId = "Q" + query.getKey();
            String queryText = removePunctuations(query.getValue()).toLowerCase().trim();

            Map<String, Double> firstPass = compute_BM25_score_for(queryText, queryId);

            // A growable list replaces the fixed 1000-slot array, which overflowed
            // once the feedback documents contributed more terms than it could hold.
            ArrayList<String> expansion = new ArrayList<>();
            int docsUsed = 0;
            for (String docId : firstPass.keySet()) {
                if (docsUsed >= FEEDBACK_DOC_COUNT) {
                    break;
                }
                String[] candidates =
                        pseudoRelevanceFeedback(docId, expansion.toArray(new String[0]));
                for (String candidate : candidates) {
                    if (candidate == null) {
                        break;
                    }
                    expansion.add(candidate);
                }
                docsUsed++;
            }

            // Every term is followed by a space so the last expansion term does not
            // fuse with the first word of the original query.
            StringBuilder expandedQuery = new StringBuilder();
            for (String term : expansion) {
                expandedQuery.append(term).append(' ');
            }
            expandedQuery.append(queryText);

            Map<String, Double> finalResult =
                    compute_BM25_score_for(expandedQuery.toString(), queryId);

            File resultFile = new File(outputDir, queryId + "-top100.txt");
            try (FileWriter writer = new FileWriter(resultFile)) {
                writer.write(to_TREC_format(finalResult, queryId));
            }
        }
    }

    /**
     * Scores every document that contains at least one query term with BM25.
     *
     * <p>Each call starts from an empty score table (also published through
     * {@link #rankings}), so scores of an earlier call never leak into a later one.
     * A term that occurs several times in the query contributes once per occurrence.
     * Queries without relevance judgements are scored with R = 0 and r = 0.
     *
     * @param query whitespace-separated query terms
     * @param q_id key into {@link #relevance}, e.g. {@code "Q12"}
     * @return document id -> score, ordered by descending score
     * @throws IOException never thrown by this implementation; kept for callers
     */
    public static Map<String, Double> compute_BM25_score_for(String query, String q_id)
            throws IOException {
        HashMap<String, Double> scores = new HashMap<>();
        rankings = scores;

        ArrayList<String> relevantDocs = relevance.get(q_id);
        if (relevantDocs == null) {
            relevantDocs = new ArrayList<>();
        }
        double R = relevantDocs.size();
        // Fall back to the known documents when N was not initialised by run().
        int collectionSize = N > 0 ? N : doc_length.size();

        for (String term : query.split(" ")) {
            HashMap<String, Long> postings = index.get(term);
            if (postings == null) {
                continue;
            }
            double ni = postings.size();
            double ri = 0.0;
            for (String relevantDoc : relevantDocs) {
                if (postings.get(relevantDoc) != null) {
                    ri += 1.0;
                }
            }

            double relevancePart = (ri + 0.5) / (R - ri + 0.5);
            double collectionPart = (ni - ri + 0.5) / (collectionSize - ni - R + ri + 0.5);
            double ratio = relevancePart / collectionPart;
            if (!(ratio > 0.0) || Double.isInfinite(ratio)) {
                // Inconsistent statistics (e.g. judgements outside the collection):
                // the logarithm is undefined, so the term carries no evidence.
                continue;
            }
            double termWeight = Math.log(ratio);

            for (Map.Entry<String, Long> posting : postings.entrySet()) {
                String docId = posting.getKey();
                double fi = posting.getValue().doubleValue();
                double K = compute_k(doc_length.get(docId));
                double score = termWeight * (((K1 + 1) * fi) / (K + fi));
                scores.merge(docId, score, Double::sum);
            }
        }
        return sortByValue(scores);
    }

    /**
     * Renders at most the first 100 entries of a ranking, one line per document:
     * {@code " <query id> <doc id> <rank> <score> GOOGLE_SERVER_#234"}.
     *
     * <p>The line layout (leading space, no {@code Q0} column) is kept exactly as
     * before so existing consumers of the result files keep working.
     *
     * @param t ranking in the order it should be written
     * @param Q_ID query id printed on every line
     * @return the formatted lines, each terminated by {@code '\n'}
     */
    public static String to_TREC_format(Map<String, Double> t, String Q_ID) {
        StringBuilder result = new StringBuilder();
        int rank = 1;
        for (Map.Entry<String, Double> entry : t.entrySet()) {
            if (rank > MAX_RESULTS) {
                break;
            }
            result.append(' ').append(Q_ID)
                    .append(' ').append(entry.getKey())
                    .append(' ').append(rank++)
                    .append(' ').append(entry.getValue())
                    .append(" GOOGLE_SERVER_#234\n");
        }
        return result.toString();
    }

    /**
     * Returns a copy of {@code map} whose iteration order is by descending value.
     * Entries with equal values keep the relative order of the input map.
     *
     * @param map the entries to sort
     * @return a {@link LinkedHashMap} ordered from highest to lowest value
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map) {
        return map.entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Collections.reverseOrder()))
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new));
    }

    /**
     * Recomputes {@link #avg_dl} from {@link #doc_length}; an empty collection
     * yields 0 instead of NaN.
     */
    public static void set_avg_dl() {
        if (doc_length.isEmpty()) {
            avg_dl = 0.0;
            return;
        }
        double total = 0.0;
        for (Long length : doc_length.values()) {
            if (length != null) {
                total += length;
            }
        }
        avg_dl = total / doc_length.size();
    }

    /**
     * Computes the BM25 length normalisation {@code K = k1 * ((1 - b) + b * dl / avdl)}
     * with k1 = 1.2 and b = 0.75.
     *
     * @param dl document length; {@code null} is treated as an average-length document
     * @return the normalisation factor K
     */
    public static double compute_k(Long dl) {
        double lengthRatio = 1.0;
        if (dl != null && avg_dl > 0.0) {
            lengthRatio = dl / avg_dl;
        }
        return K1 * ((1 - B) + B * lengthRatio);
    }

    /**
     * Loads {@link #index} and {@link #doc_length} from their serialized files.
     * Failures are reported on the console and leave the affected field unchanged.
     */
    public static void get_inverted_index() {
        try {
            index = (HashMap) readSerialized(new File(DATA_DIR, "index.ser"));
            doc_length = (HashMap) readSerialized(new File(DATA_DIR, "doc_len.ser"));
        } catch (IOException | ClassNotFoundException e) {
            System.out.println(" FILES NOT PROPERLY IMPORTED");
            System.err.println("Could not load index data: " + e);
        }
    }

    /**
     * Reads relevance judgements and adds them to {@link #relevance}.
     *
     * <p>Each line is split on whitespace; field 0 is the query id and field 2 the
     * document id. Lines with fewer than three fields are skipped.
     *
     * @param PATH path of the judgement file
     */
    public static void get_relevant_documents(String PATH) {
        try (Scanner lines = new Scanner(new File(PATH))) {
            while (lines.hasNextLine()) {
                String[] fields = lines.nextLine().trim().split("\\s+");
                if (fields.length < 3) {
                    continue;
                }
                String key = "Q" + fields[0];
                ArrayList<String> docs = relevance.get(key);
                if (docs == null) {
                    docs = new ArrayList<>();
                    relevance.put(key, docs);
                }
                docs.add(fields[2]);
            }
        } catch (FileNotFoundException ex) {
            System.out.println("Please enter a valid path ....");
        }
    }

    /**
     * Loads the serialized query map.
     *
     * @return query id -> query text; an empty map when the file cannot be read
     */
    public static HashMap<String, String> get_test_queries() {
        HashMap<String, String> loaded = null;
        try {
            loaded = (HashMap<String, String>) readSerialized(new File(DATA_DIR, "query-test.ser"));
        } catch (IOException i) {
            i.printStackTrace();
        } catch (ClassNotFoundException c) {
            System.out.println("query-test not found");
            c.printStackTrace();
        }
        return loaded != null ? loaded : new HashMap<String, String>();
    }

    /**
     * Strips punctuation from {@code s}.
     *
     * <p>Letters, digits, spaces and hyphens are kept. Any other character is kept
     * only when the next character is a digit (so {@code "3.14"} survives), except
     * that a bracket or parenthesis in front of a digit becomes a space.
     *
     * @param s text to clean; {@code null} is treated as empty
     * @return the cleaned text
     */
    public static String removePunctuations(String s) {
        if (s == null || s.isEmpty()) {
            return "";
        }
        StringBuilder cleaned = new StringBuilder(s.length());
        int last = s.length() - 1;
        for (int i = 0; i <= last; i++) {
            char current = s.charAt(i);
            if (i < last && Character.isDigit(s.charAt(i + 1))) {
                cleaned.append(isBracket(current) ? ' ' : current);
            } else if (isKeptCharacter(current)) {
                cleaned.append(current);
            }
        }
        return cleaned.toString();
    }

    /**
     * Reads the whitespace-separated stopword list into {@link #stopwords}.
     *
     * <p>The array holds at least 450 slots and always ends with at least one
     * {@code null}, so code that scans it up to the first {@code null} stays safe.
     *
     * @throws Exception if the stopword file cannot be read
     */
    public static void fillStopwords() throws Exception {
        ArrayList<String> words = new ArrayList<>();
        try (FileReader fileReader = new FileReader(new File(DATA_DIR, "common_words.txt"));
                Scanner scanner = new Scanner(new BufferedReader(fileReader))) {
            while (scanner.hasNext()) {
                words.add(scanner.next());
            }
        }
        String[] filled = new String[Math.max(STOPWORD_CAPACITY, words.size() + 1)];
        stopwords = words.toArray(filled);
    }

    /**
     * Picks expansion terms from one document: its most frequent index terms that
     * are neither stopwords nor already part of {@code queryExpansion}.
     *
     * @param doc id of the document to mine
     * @param queryExpansion terms chosen so far; scanned up to the first {@code null}
     *     or the end of the array
     * @return an array of length 1000 holding at most 100 terms at the front,
     *     followed by {@code null}
     */
    public static String[] pseudoRelevanceFeedback(String doc, String[] queryExpansion) {
        HashMap<String, Long> termFrequency = new HashMap<>();
        for (Map.Entry<String, HashMap> entry : index.entrySet()) {
            HashMap postings = entry.getValue();
            if (postings == null) {
                continue;
            }
            Object frequency = postings.get(doc);
            if (frequency != null) {
                termFrequency.put(entry.getKey(), (Long) frequency);
            }
        }

        String[] selected = new String[FEEDBACK_ARRAY_SIZE];
        int count = 0;
        for (String term : sortByValue(termFrequency).keySet()) {
            if (count >= FEEDBACK_TERMS_PER_DOC) {
                break;
            }
            if (containsTerm(queryExpansion, term) || containsTerm(stopwords, term)) {
                continue;
            }
            selected[count++] = term;
        }
        return selected;
    }

    /**
     * Deserializes one object from {@code file}, closing both streams afterwards.
     * NOTE(review): Java-native deserialization must only be used on trusted files.
     */
    private static Object readSerialized(File file) throws IOException, ClassNotFoundException {
        try (FileInputStream fileIn = new FileInputStream(file);
                ObjectInputStream in = new ObjectInputStream(fileIn)) {
            return in.readObject();
        }
    }

    /** Tells whether {@code term} occurs in {@code terms} before the first {@code null}. */
    private static boolean containsTerm(String[] terms, String term) {
        if (terms == null) {
            return false;
        }
        for (String candidate : terms) {
            if (candidate == null) {
                break;
            }
            if (candidate.equals(term)) {
                return true;
            }
        }
        return false;
    }

    /** Tells whether {@code c} is one of {@code [ ] ( )}. */
    private static boolean isBracket(char c) {
        return c == '[' || c == ']' || c == '(' || c == ')';
    }

    /** Tells whether {@code c} survives punctuation removal on its own. */
    private static boolean isKeptCharacter(char c) {
        return Character.isLetterOrDigit(c) || c == ' ' || c == '-';
    }
}
0 commit comments