Skip to content

Commit d7356db

Browse files
Kevin Clark and Stanford NLP
Kevin Clark
authored and
Stanford NLP
committed
Merge branch 'master' of origin
1 parent 58182e9 commit d7356db

File tree

67 files changed

+85902
-85461
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

67 files changed

+85902
-85461
lines changed

doc/corenlp/pom-full.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,15 @@
8585
</dependency>
8686

8787
<dependency>
88-
<groupId>com.io7m.xom</groupId>
88+
<groupId>xom</groupId>
8989
<artifactId>xom</artifactId>
90-
<version>1.2.10</version>
90+
<version>1.3.2</version>
9191
</dependency>
9292

9393
<dependency>
9494
<groupId>joda-time</groupId>
9595
<artifactId>joda-time</artifactId>
96-
<version>2.9.4</version>
96+
<version>2.10.5</version>
9797
</dependency>
9898

9999
<dependency>

itest/src/edu/stanford/nlp/ie/NumberNormalizerITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public static void runOnceBeforeClass() {
2828
System.err.println("Setting up pipeline in @BeforeClasss");
2929
}
3030
pipeline = new AnnotationPipeline();
31-
pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
31+
pipeline.addAnnotator(new TokenizerAnnotator(false, "en", "invertible,splitHyphenated=false"));
3232
pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
3333
pipeline.addAnnotator(new POSTaggerAnnotator(DefaultPaths.DEFAULT_POS_MODEL, false));
3434
}

itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public class NumberSequenceClassifierITest {
2121
private static StanfordCoreNLP makeNumericPipeline() {
2222
Properties props = new Properties();
2323
props.setProperty("annotators", "tokenize, ssplit, pos, number, qen");
24+
props.setProperty("tokenize.options", "splitHyphenated=false");
2425
props.setProperty("customAnnotatorClass.number",
2526
"edu.stanford.nlp.pipeline.NumberAnnotator");
2627
props.setProperty("customAnnotatorClass.qen",

itest/src/edu/stanford/nlp/ie/crf/CRFClassifierITest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,16 +245,16 @@ public class CRFClassifierITest {
245245
{ " \"anaesthetic Smith is\" ",
246246
" \"/Oanaesthetic/O Smith/PERSON is/O\"/O ",
247247
" \"anaesthetic <PERSON>Smith</PERSON> is\" ",
248-
"<wi num=\"0\" entity=\"O\">&dquot;</wi>\n" +
248+
"<wi num=\"0\" entity=\"O\">&quot;</wi>\n" +
249249
"<wi num=\"1\" entity=\"O\">anaesthetic</wi>\n" +
250250
"<wi num=\"2\" entity=\"PERSON\">Smith</wi>\n" +
251251
"<wi num=\"3\" entity=\"O\">is</wi>\n" +
252-
"<wi num=\"4\" entity=\"O\">&dquot;</wi>\n",
253-
" <wi num=\"0\" entity=\"O\">``</wi>" +
252+
"<wi num=\"4\" entity=\"O\">&quot;</wi>\n",
253+
" <wi num=\"0\" entity=\"O\">&quot;</wi>" +
254254
"<wi num=\"1\" entity=\"O\">anaesthetic</wi> " +
255255
"<wi num=\"2\" entity=\"PERSON\">Smith</wi> " +
256256
"<wi num=\"3\" entity=\"O\">is</wi>" +
257-
"<wi num=\"4\" entity=\"O\">&apos;&apos;</wi> ",
257+
"<wi num=\"4\" entity=\"O\">&quot;</wi> ",
258258
"\"/O anaesthetic/O Smith/PERSON is/O \"/O \n",
259259
"\" anaesthetic <PERSON>Smith</PERSON> is \" \n",
260260

itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public void setUp() throws Exception {
2828
synchronized(TokenSequenceMatcherITest.class) {
2929
if (pipeline == null) {
3030
pipeline = new AnnotationPipeline();
31-
pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
31+
pipeline.addAnnotator(new TokenizerAnnotator(false, "en", "invertible,splitHyphenated=false"));
3232
pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
3333
pipeline.addAnnotator(new POSTaggerAnnotator(false));
3434
pipeline.addAnnotator(new NumberAnnotator(false, false));

itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,19 @@ public void testSimpleSentence() throws IOException {
3535
Annotation ann = new Annotation("CoNLL-U is neat. Better than XML.");
3636
pipeline.annotate(ann);
3737
String actual = new CoNLLUOutputter("enhanced").print(ann);
38-
String expected = "1\tCoNLL-U\tconll-u\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t3:nsubj\t_\n" +
39-
"2\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tcop\t3:cop\t_\n" +
40-
"3\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_\n" +
41-
"4\t.\t.\tPUNCT\t.\t_\t3\tpunct\t3:punct\t_\n" +
42-
"\n" +
43-
"1\tBetter\tbetter\tADV\tRBR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
44-
"2\tthan\tthan\tADP\tIN\t_\t3\tcase\t3:case\t_\n" +
45-
"3\tXML\txml\tNOUN\tNN\tNumber=Sing\t1\tobl\t1:obl:than\t_\n" +
46-
"4\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n\n";
38+
String expected = "1\tCoNLL\tconll\tNOUN\tNN\tNumber=Sing\t0\troot\t0:root\t_\n" +
39+
"2\t-\t-\tPUNCT\t:\t_\t1\tpunct\t1:punct\t_\n" +
40+
"3\tU\tU\tPROPN\tNNP\tNumber=Sing\t5\tnsubj\t5:nsubj\t_\n" +
41+
"4\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t5\tcop\t5:cop\t_\n" +
42+
"5\tneat\tneat\tADJ\tJJ\tDegree=Pos\t1\tappos\t1:appos\t_\n" +
43+
"6\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n" +
44+
"\n" +
45+
"1\tBetter\tbetter\tADV\tRBR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
46+
"2\tthan\tthan\tADP\tIN\t_\t3\tcase\t3:case\t_\n" +
47+
"3\tXML\txml\tNOUN\tNN\tNumber=Sing\t1\tobl\t1:obl:than\t_\n" +
48+
"4\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n\n";
4749
assertEquals(expected, actual);
4850
}
4951

5052
}
53+

itest/src/edu/stanford/nlp/pipeline/EntityMentionsAnnotatorITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ public void testNewsText() {
181181
"[Text=China CharacterOffsetBegin=297 CharacterOffsetEnd=302 Tokens=[China-10] TokenBegin=56 TokenEnd=57 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2 EntityMentionIndex=12 CanonicalEntityMentionIndex=12 NamedEntityTagProbs={LOCATION=0.9991144698247745}]",
182182
"[Text=one CharacterOffsetBegin=311 CharacterOffsetEnd=314 Tokens=[one-14] TokenBegin=60 TokenEnd=61 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0 EntityType=NUMBER SentenceIndex=2 EntityMentionIndex=13 CanonicalEntityMentionIndex=13 NamedEntityTagProbs={NUMBER=-1.0}]",
183183
"[Text=10 million CharacterOffsetBegin=366 CharacterOffsetEnd=376 Tokens=[10-25, million-26] TokenBegin=71 TokenEnd=73 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0E7 EntityType=NUMBER SentenceIndex=2 EntityMentionIndex=14 CanonicalEntityMentionIndex=14 NamedEntityTagProbs={NUMBER=-1.0}]",
184-
"[Text=British CharacterOffsetBegin=377 CharacterOffsetEnd=384 Tokens=[British-27] TokenBegin=73 TokenEnd=74 NamedEntityTag=MISC EntityType=MISC SentenceIndex=2 EntityMentionIndex=15 CanonicalEntityMentionIndex=15 NamedEntityTagProbs={MISC=0.99999912098433}]",
184+
"[Text=British CharacterOffsetBegin=377 CharacterOffsetEnd=384 Tokens=[British-27] TokenBegin=73 TokenEnd=74 NamedEntityTag=MISC EntityType=MISC SentenceIndex=2 EntityMentionIndex=15 CanonicalEntityMentionIndex=15 NamedEntityTagProbs={MISC=0.9999991248574666}]",
185185
"[Text=16.14 million CharacterOffsetBegin=393 CharacterOffsetEnd=406 Tokens=[16.14-30, million-31] TokenBegin=76 TokenEnd=78 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.614E7 EntityType=NUMBER SentenceIndex=2 EntityMentionIndex=16 CanonicalEntityMentionIndex=16 NamedEntityTagProbs={NUMBER=-1.0}]",
186186
"[Text=U.S. CharacterOffsetBegin=407 CharacterOffsetEnd=411 Tokens=[U.S.-32] TokenBegin=78 TokenEnd=79 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2 EntityMentionIndex=17 CanonicalEntityMentionIndex=17 NamedEntityTagProbs={LOCATION=0.809576375280559}]"
187187
};
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package edu.stanford.nlp.pipeline;
2+
3+
import edu.stanford.nlp.util.Pair;
4+
import junit.framework.TestCase;
5+
6+
import java.io.ByteArrayInputStream;
7+
import java.io.ByteArrayOutputStream;
8+
import java.io.IOException;
9+
import java.io.InputStream;
10+
11+
public class MWTProtobufSerializationITest extends TestCase {
12+
13+
public String sampleText = "Le but des bandes de roulement est d'augmenter la traction. Elle est présidente du conseil " +
14+
"d'administration.";
15+
16+
public StanfordCoreNLP pipeline;
17+
18+
AnnotationSerializer serializer;
19+
20+
@Override
21+
public void setUp() {
22+
// set up pipeline and serializer
23+
pipeline = new StanfordCoreNLP("french");
24+
serializer = new ProtobufAnnotationSerializer();
25+
}
26+
27+
public void testBasicExample() throws ClassNotFoundException, IOException {
28+
// set up document
29+
CoreDocument sampleDocument = new CoreDocument(sampleText);
30+
// annotate
31+
pipeline.annotate(sampleDocument);
32+
// serialize
33+
ByteArrayOutputStream ks = new ByteArrayOutputStream();
34+
serializer.writeCoreDocument(sampleDocument, ks).close();
35+
// Read
36+
InputStream kis = new ByteArrayInputStream(ks.toByteArray());
37+
Pair<Annotation, InputStream> pair = serializer.read(kis);
38+
pair.second.close();
39+
Annotation readAnnotation = pair.first;
40+
kis.close();
41+
ProtobufAnnotationSerializerSlowITest.sameAsRead(sampleDocument.annotation(), readAnnotation);
42+
}
43+
44+
}

itest/src/edu/stanford/nlp/pipeline/QuoteAnnotatorITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ public void testMultiParagraphQuoteSingle() {
386386
" 'I am the second paragraph.\n\n" +
387387
"'I am the second to last.\n\n" +
388388
"'see there's more here.'", quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
389-
assertInnerAnnotationValues(quotes.get(0), 0, 0, 2, 3, 28);
389+
assertInnerAnnotationValues(quotes.get(0), 0, 0, 3, 3, 28);
390390
}
391391

392392
public void testMultiLineQuoteDouble() {

itest/src/edu/stanford/nlp/pipeline/TrueCaseAnnotatorITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public void testTrueCaseAnnotator() {
6161
String text4 = "\"GOOD MORNING AMERICA FROM MCVEY!\"";
6262
String text5 = "\"good morning america from mcvey!\"";
6363
String text6 = "\"Good Morning America From McVey!\"";
64-
String[] ans4 = { "``", "Good", "Morning", "America", "from", "McVey", "!", "''" };
64+
String[] ans4 = { "\"", "Good", "Morning", "America", "from", "McVey", "!", "\"" };
6565

6666
Properties props = PropertiesUtils.asProperties("annotators", "tokenize, ssplit, truecase");
6767
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

itest/src/edu/stanford/nlp/process/PTBTokenizerITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class PTBTokenizerITest extends TestCase {
2424
private static void compareResults(BufferedReader testReader,
2525
List<String> goldResults) {
2626
PTBTokenizer<CoreLabel> tokenizer =
27-
new PTBTokenizer<>(testReader, new CoreLabelTokenFactory(), "");
27+
new PTBTokenizer<>(testReader, new CoreLabelTokenFactory(), "ptb3Escaping=true");
2828
List<String> testResults = new ArrayList<>();
2929
while (tokenizer.hasNext()) {
3030
CoreLabel w = tokenizer.next();

lib/jaxb-core-2.3.0.1.jar

8.89 KB
Binary file not shown.

lib/jaxb-impl-2.4.0-b180830.0438.jar

28.2 KB
Binary file not shown.

lib/joda-time.jar

13.2 KB
Binary file not shown.

lib/xom-1.2.10.jar

-306 KB
Binary file not shown.

lib/xom-1.3.2.jar

320 KB
Binary file not shown.

libsrc/joda-time-2.10.5-sources.jar

802 KB
Binary file not shown.

libsrc/joda-time-2.9-sources.jar

-756 KB
Binary file not shown.

libsrc/xom-1.2.10-src.jar

-656 KB
Binary file not shown.

libsrc/xom-1.3.2-sources.jar

310 KB
Binary file not shown.

pom-java-11.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,13 @@
8585
<dependency>
8686
<groupId>com.io7m.xom</groupId>
8787
<artifactId>xom</artifactId>
88-
<version>1.2.10</version>
88+
<version>1.3.2</version>
8989
</dependency>
9090

9191
<dependency>
9292
<groupId>joda-time</groupId>
9393
<artifactId>joda-time</artifactId>
94-
<version>2.9.4</version>
94+
<version>2.10.5</version>
9595
</dependency>
9696

9797
<dependency>

pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,15 @@
8585
</dependency>
8686

8787
<dependency>
88-
<groupId>com.io7m.xom</groupId>
88+
<groupId>xom</groupId>
8989
<artifactId>xom</artifactId>
90-
<version>1.2.10</version>
90+
<version>1.3.2</version>
9191
</dependency>
9292

9393
<dependency>
9494
<groupId>joda-time</groupId>
9595
<artifactId>joda-time</artifactId>
96-
<version>2.9.4</version>
96+
<version>2.10.5</version>
9797
</dependency>
9898

9999
<dependency>
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Convert five class sentiment data into text files of the type you
2+
# might use a sentence classifier for.
3+
4+
# The default directories are what I set up on my local machine.
5+
# -i and -o change the input and output dirs.
6+
7+
INPUT_DIR=extern_data/sentiment/sentiment-treebank
8+
OUTPUT_DIR=extern_data/sentiment/sst-processed
9+
10+
while getopts "i:o:" OPTION
11+
do
12+
case $OPTION in
13+
i)
14+
INPUT_DIR=$OPTARG
15+
;;
16+
o)
17+
OUTPUT_DIR=$OPTARG
18+
;;
19+
esac
20+
done
21+
22+
23+
echo INPUT DIR: $INPUT_DIR
24+
echo OUTPUT DIR: $OUTPUT_DIR
25+
26+
mkdir -p $OUTPUT_DIR/binary
27+
mkdir -p $OUTPUT_DIR/fiveclass
28+
29+
echo $OUTPUT_DIR/fiveclass/train-phrases.txt
30+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/train.txt > $OUTPUT_DIR/fiveclass/train-phrases.txt
31+
32+
echo $OUTPUT_DIR/fiveclass/dev-phrases.txt
33+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt > $OUTPUT_DIR/fiveclass/dev-phrases.txt
34+
35+
echo $OUTPUT_DIR/fiveclass/test-phrases.txt
36+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt > $OUTPUT_DIR/fiveclass/test-phrases.txt
37+
38+
39+
echo $OUTPUT_DIR/fiveclass/train-roots.txt
40+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/train.txt -root_only > $OUTPUT_DIR/fiveclass/train-roots.txt
41+
42+
echo $OUTPUT_DIR/fiveclass/dev-roots.txt
43+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt -root_only > $OUTPUT_DIR/fiveclass/dev-roots.txt
44+
45+
echo $OUTPUT_DIR/fiveclass/test-roots.txt
46+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt -root_only > $OUTPUT_DIR/fiveclass/test-roots.txt
47+
48+
49+
echo $OUTPUT_DIR/binary/train-binary-phrases.txt
50+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/train.txt -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/train-binary-phrases.txt
51+
52+
echo $OUTPUT_DIR/binary/dev-binary-phrases.txt
53+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/dev-binary-phrases.txt
54+
55+
echo $OUTPUT_DIR/binary/test-binary-phrases.txt
56+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/test-binary-phrases.txt
57+
58+
echo $OUTPUT_DIR/binary/dev-binary-roots.txt
59+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt -root_only -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/dev-binary-roots.txt
60+
61+
echo $OUTPUT_DIR/binary/test-binary-roots.txt
62+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt -root_only -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/test-binary-roots.txt
63+

src/edu/stanford/nlp/classify/Dataset.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ public class Dataset<L, F> extends GeneralDataset<L, F> {
5252

5353
final static Redwood.RedwoodChannels logger = Redwood.channels(Dataset.class);
5454

55+
/** we will multiply by this constant instead of divide by log(2) */
56+
private static final double LN_TO_LOG2 = 1. / Math.log(2);
57+
5558
public Dataset() {
5659
this(10);
5760
}
@@ -658,7 +661,7 @@ public double[] getInformationGains() {
658661
for (int i = 0; i < labelIndex.size(); i++) {
659662
double labelCount = labelCounter.getCount(labelIndex.get(i));
660663
double p = labelCount / size();
661-
entropy -= p * (Math.log(p) / Math.log(2));
664+
entropy -= p * Math.log(p) * LN_TO_LOG2;
662665
}
663666

664667
double[] ig = new double[featureIndex.size()];
@@ -693,11 +696,11 @@ public double[] getInformationGains() {
693696
double pNot = notFeatureLabelCount / notFeatureCount;
694697

695698
if (featureLabelCount != 0) {
696-
sumFeature += p * (Math.log(p) / Math.log(2));
699+
sumFeature += p * Math.log(p) * LN_TO_LOG2;
697700
}
698701

699702
if (notFeatureLabelCount != 0) {
700-
sumNotFeature += pNot * (Math.log(pNot) / Math.log(2));
703+
sumNotFeature += pNot * Math.log(pNot) * LN_TO_LOG2;
701704
}
702705
//System.out.println(pNot+" "+(Math.log(pNot)/Math.log(2)));
703706

src/edu/stanford/nlp/classify/LogPrior.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ public class LogPrior implements Serializable {
1616

1717
public enum LogPriorType { NULL, QUADRATIC, HUBER, QUARTIC, COSH, ADAPT, MULTIPLE_QUADRATIC }
1818

19+
private static final double LOG2 = Math.log(2);
20+
1921
public static LogPriorType getType(String name) {
2022
if (name.equalsIgnoreCase("null")) { return LogPriorType.NULL; }
2123
else if (name.equalsIgnoreCase("quadratic")) { return LogPriorType.QUADRATIC; }
@@ -298,7 +300,7 @@ public double compute(double[] x, double[] grad) {
298300
double norm = ArrayMath.norm_1(x) / sigmaSq;
299301
double d;
300302
if (norm > 30.0) {
301-
val = norm - Math.log(2);
303+
val = norm - LOG2;
302304
d = 1.0 / sigmaSq;
303305
} else {
304306
val = Math.log(Math.cosh(norm));

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,15 +1601,15 @@ protected void printFeatures(IN wi, Collection<String> features) {
16011601
}
16021602

16031603
/** Print the String features generated from a token. */
1604-
protected void printFeatureLists(IN wi, Collection<List<String>> features) {
1604+
protected void printFeatureLists(IN wi, Collection<Collection<String>> features) {
16051605
if (flags.printFeatures == null || writtenNum >= flags.printFeaturesUpto) {
16061606
return;
16071607
}
16081608
printFeatureListsHelper(wi, features);
16091609
}
16101610

16111611
// Separating this method out lets printFeatureLists be inlined, which is good since it is usually a no-op.
1612-
private void printFeatureListsHelper(IN wi, Collection<List<String>> features) {
1612+
private void printFeatureListsHelper(IN wi, Collection<Collection<String>> features) {
16131613
if (cliqueWriter == null) {
16141614
cliqueWriter = IOUtils.getPrintWriterOrDie("features-" + flags.printFeatures + ".txt");
16151615
writtenNum = 0;
@@ -1622,7 +1622,7 @@ private void printFeatureListsHelper(IN wi, Collection<List<String>> features) {
16221622
+ wi.get(CoreAnnotations.GoldAnswerAnnotation.class) + '\t');
16231623
}
16241624
boolean first = true;
1625-
for (List<String> featList : features) {
1625+
for (Collection<String> featList : features) {
16261626
List<String> sortedFeatList = new ArrayList<>(featList);
16271627
Collections.sort(sortedFeatList);
16281628
for (String feat : sortedFeatList) {

src/edu/stanford/nlp/ie/crf/CRFBiasedClassifier.java

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,19 @@ public CRFBiasedClassifier(Properties props) {
5757
public CRFBiasedClassifier(SeqClassifierFlags flags) {super(flags); }
5858

5959
@Override
60-
public CRFDatum<List<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
60+
public CRFDatum<Collection<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
6161

6262
pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
6363
PaddedList<IN> pInfo = new PaddedList<>(info, pad);
6464

65-
List<List<String>> features = new ArrayList<>();
66-
Collection<Clique> done = Generics.newHashSet();
65+
List<Collection<String>> features = new ArrayList<>();
6766
for (int i = 0; i < windowSize; i++) {
6867
List<String> featuresC = new ArrayList<>();
69-
List<Clique> windowCliques = FeatureFactory.getCliques(i, 0);
70-
windowCliques.removeAll(done);
71-
done.addAll(windowCliques);
72-
for (Clique c : windowCliques) {
68+
FeatureFactory.eachClique(i, 0, c -> {
7369
for (FeatureFactory<IN> featureFactory : featureFactories) {
7470
featuresC.addAll(featureFactory.getCliqueFeatures(pInfo, loc, c));
7571
}
76-
}
72+
});
7773
if (testTime && i==0) {
7874
// this feature is only present at test time and only appears
7975
// in cliques of size 1 (i.e., cliques with window=0)

0 commit comments

Comments
 (0)