-
Notifications
You must be signed in to change notification settings - Fork 0
/
NGramMaker.java
134 lines (108 loc) · 3.79 KB
/
NGramMaker.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/**
* Author: Andrew Laing
* Email: [email protected]
* Date: 26/12/2016.
*/
import java.util.ArrayList;
import java.util.regex.*;
import java.util.List;
import java.io.*;
/**
 * Static helpers that split text into word tokens and build word-level
 * n-grams (space-separated runs of {@code n} consecutive tokens).
 *
 * The n-gram size range used by {@link #getNGramsFromFile} is held in static
 * fields, so it is shared by every caller of this class.
 */
public class NGramMaker
{
    // A word must begin with a letter. It may continue with hyphen-joined
    // letter runs, then any alphanumerics, then apostrophe-joined letter runs
    // (so "well-known", "x2" and "don't" are each a single token).
    private static final String WORD_REGEX = "[a-zA-Z]+(?:-[a-zA-Z]+)*[\\p{Alnum}]*(?:'[a-zA-Z]+)*";

    // Compile the regex once because it will be reused many times
    private static final Pattern WORD_PATTERN = Pattern.compile(WORD_REGEX);

    // The default minimum and maximum sizes for the nGrams
    private static int ngramMin = 2;
    private static int ngramMax = 4;

    /**
     * Sets the smallest n-gram size produced by {@link #getNGramsFromFile}.
     * @param min The minimum n-gram size; sizes below 1 produce no n-grams
     */
    public static void setNgramMin(int min) {
        ngramMin = min;
    }

    /**
     * Sets the largest n-gram size produced by {@link #getNGramsFromFile}.
     * @param max The maximum n-gram size
     */
    public static void setNgramMax(int max) {
        ngramMax = max;
    }

    /**
     * The method getTokens tokenises the string passed and appends the tokens
     * to the List passed, in the order they occur in the string.
     * @param tokens The List to add the tokens to
     * @param toSplit The string to tokenise
     */
    public static void getTokens(List<? super String> tokens, String toSplit)
    {
        // Normalise typographic punctuation to the ASCII forms the regex
        // understands. Unicode escapes keep this independent of the encoding
        // the source file is compiled with.
        String normalised = toSplit.replace('\u2019', '\'');   // right single quotation mark
        // NOTE(review): the dash character originally replaced here was lost
        // from the source; HYPHEN and NON-BREAKING HYPHEN are assumed.
        normalised = normalised.replace('\u2010', '-');
        normalised = normalised.replace('\u2011', '-');

        Matcher match = WORD_PATTERN.matcher(normalised);
        while (match.find()) {
            tokens.add(match.group());
        }
    }

    /**
     * The method tokeniseFile tokenises a file's contents line by line and
     * appends the tokens to the List passed. Tokens never span a line break.
     * The file is read with FileReader, i.e. using the platform's default
     * character encoding.
     * @param tokens The List to add the tokens to
     * @param filename The name of the file to tokenise
     * @throws IOException if the file cannot be opened or read
     */
    public static void tokeniseFile(List<? super String> tokens, String filename) throws IOException
    {
        // try-with-resources closes the reader even if reading or tokenising fails
        try (BufferedReader inputFile = new BufferedReader(new FileReader(filename))) {
            String line;
            // Process lines until no more are found in the file
            while ((line = inputFile.readLine()) != null) {
                getTokens(tokens, line);
            }
        }
    }

    /**
     * The method getNGrams stores nGrams created from a List of tokens
     * into a List passed to it. Each nGram is n consecutive tokens joined by
     * single spaces, with leading and trailing whitespace trimmed. Nothing is
     * added when n is not positive or exceeds the number of tokens.
     * @param tokens The List of tokens to create nGrams from
     * @param nGrams The List to store the nGrams into
     * @param n The size of the nGrams to create
     */
    public static void getNGrams(List<?> tokens, List<? super String> nGrams, int n)
    {
        if (n <= 0) {
            return; // a non-positive size has no meaningful nGrams
        }
        int lastStart = tokens.size() - n; // index of the last full window
        StringBuilder buffer = new StringBuilder();
        for (int start = 0; start <= lastStart; start++) {
            buffer.setLength(0);
            for (int offset = 0; offset < n; offset++) {
                buffer.append(tokens.get(start + offset)).append(' ');
            }
            nGrams.add(buffer.toString().trim());
        }
    }

    /**
     * The method getNGramsFromFile stores nGrams created from a text file
     * into the List passed to it. All nGrams of the minimum size are added
     * first, then each larger size in turn up to the maximum size.
     * @param nGrams The List to store the nGrams into
     * @param filename The file to read in
     * @throws IOException if the file cannot be opened or read
     */
    public static void getNGramsFromFile(List<? super String> nGrams, String filename) throws IOException
    {
        List<String> tokens = new ArrayList<>();
        tokeniseFile(tokens, filename);

        // Sizes below 1 or above the token count can never yield an nGram, so
        // clamp the range; this also keeps the loop counter from overflowing
        // when the maximum is Integer.MAX_VALUE.
        int smallest = Math.max(1, ngramMin);
        int largest = Math.min(ngramMax, tokens.size());

        // Create nGrams of each size and add them to the List
        for (int n = smallest; n <= largest; n++) {
            getNGrams(tokens, nGrams, n);
        }
    }
}