-
Notifications
You must be signed in to change notification settings - Fork 0
/
Index.java
238 lines (204 loc) · 6.45 KB
/
Index.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
package cs276.assignments;
import cs276.util.Pair;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.LinkedList;
public class Index {
// Term id -> (position in index file, doc frequency) dictionary
private static Map<Integer, Pair<Long, Integer>> postingDict
= new TreeMap<Integer, Pair<Long, Integer>>();
// Doc name -> doc id dictionary
private static Map<String, Integer> docDict
= new TreeMap<String, Integer>();
// Term -> term id dictionary
private static Map<String, Integer> termDict
= new TreeMap<String, Integer>();
// Block queue
private static LinkedList<File> blockQueue
= new LinkedList<File>();
// Total file counter
private static int totalFileCount = 0;
// Document counter
private static int docIdCounter = 0;
// Term counter
private static int wordIdCounter = 0;
// Index
private static BaseIndex index = null;
/*
* Write a posting list to the given file
* You should record the file position of this posting list
* so that you can read it back during retrieval
*
* */
private static void writePosting(FileChannel fc, PostingList posting)
throws IOException {
/*
* TODO: Your code here
*
*/
}
public static void main(String[] args) throws IOException {
/* Parse command line */
if (args.length != 3) {
System.err
.println("Usage: java Index [Basic|VB|Gamma] data_dir output_dir");
return;
}
/* Get index */
String className = "cs276.assignments." + args[0] + "Index";
try {
Class<?> indexClass = Class.forName(className);
index = (BaseIndex) indexClass.newInstance();
} catch (Exception e) {
System.err
.println("Index method must be \"Basic\", \"VB\", or \"Gamma\"");
throw new RuntimeException(e);
}
/* Get root directory */
String root = args[1];
File rootdir = new File(root);
if (!rootdir.exists() || !rootdir.isDirectory()) {
System.err.println("Invalid data directory: " + root);
return;
}
/* Get output directory */
String output = args[2];
File outdir = new File(output);
if (outdir.exists() && !outdir.isDirectory()) {
System.err.println("Invalid output directory: " + output);
return;
}
if (!outdir.exists()) {
if (!outdir.mkdirs()) {
System.err.println("Create output directory failure");
return;
}
}
/* A filter to get rid of all files starting with .*/
FileFilter filter = new FileFilter() {
@Override
public boolean accept(File pathname) {
String name = pathname.getName();
return !name.startsWith(".");
}
};
/* BSBI indexing algorithm */
File[] dirlist = rootdir.listFiles(filter);
/* For each block */
for (File block : dirlist) {
File blockFile = new File(output, block.getName());
blockQueue.add(blockFile);
File blockDir = new File(root, block.getName());
File[] filelist = blockDir.listFiles(filter);
TreeMap<Integer, PostingList> arrangeList = new TreeMap<Integer,PostingList>();
/* For each file */
for (File file : filelist) {
++totalFileCount;
String fileName = block.getName() + "/" + file.getName();
docDict.put(fileName, docIdCounter++);
BufferedReader reader = new BufferedReader(new FileReader(file));
String line;
while ((line = reader.readLine()) != null) {
String[] tokens = line.trim().split("\\s+");
for (String token : tokens) {
/*
* TODO: Your code here
* For each term, build up a list of
* documents in which the term occurs
*/
if(termDict.get(token)==null){
termDict.put(token,wordIdCounter++);
}
int temp = termDict.get(token);
ArrayList<Integer> mylist = new ArrayList<Integer>();
mylist.add(docDict.get(fileName));
PostingList posting = new PostingList(temp,mylist);
if(arrangeList.get(temp)==null){
arrangeList.put(temp,posting);
}
else{
mylist.add(docDict.get(fileName));
arrangeList.put(temp,posting);
}
}
}
reader.close();
}
/* Sort and output */
if (!blockFile.createNewFile()) {
System.err.println("Create new block failure.");
return;
}
RandomAccessFile bfc = new RandomAccessFile(blockFile, "rw");
/*
* TODO: Your code here
* Write all posting lists for all terms to file (bfc)
*/
bfc.close();
}
/* Required: output total number of files. */
System.out.println(totalFileCount);
/* Merge blocks */
while (true) {
if (blockQueue.size() <= 1)
break;
File b1 = blockQueue.removeFirst();
File b2 = blockQueue.removeFirst();
File combfile = new File(output, b1.getName() + "+" + b2.getName());
if (!combfile.createNewFile()) {
System.err.println("Create new block failure.");
return;
}
RandomAccessFile bf1 = new RandomAccessFile(b1, "r");
RandomAccessFile bf2 = new RandomAccessFile(b2, "r");
RandomAccessFile mf = new RandomAccessFile(combfile, "rw");
/*
* TODO: Your code here
* Combine blocks bf1 and bf2 into our combined file, mf
* You will want to consider in what order to merge
* the two blocks (based on term ID, perhaps?).
*
*/
bf1.close();
bf2.close();
mf.close();
b1.delete();
b2.delete();
blockQueue.add(combfile);
}
/* Dump constructed index back into file system */
File indexFile = blockQueue.removeFirst();
indexFile.renameTo(new File(output, "corpus.index"));
BufferedWriter termWriter = new BufferedWriter(new FileWriter(new File(
output, "term.dict")));
for (String term : termDict.keySet()) {
termWriter.write(term + "\t" + termDict.get(term) + "\n");
}
termWriter.close();
BufferedWriter docWriter = new BufferedWriter(new FileWriter(new File(
output, "doc.dict")));
for (String doc : docDict.keySet()) {
docWriter.write(doc + "\t" + docDict.get(doc) + "\n");
}
docWriter.close();
BufferedWriter postWriter = new BufferedWriter(new FileWriter(new File(
output, "posting.dict")));
for (Integer termId : postingDict.keySet()) {
postWriter.write(termId + "\t" + postingDict.get(termId).getFirst()
+ "\t" + postingDict.get(termId).getSecond() + "\n");
}
postWriter.close();
}
}