Skip to content

Commit d7356db

Browse files
Kevin Clark and Stanford NLP
Kevin Clark
authored and
Stanford NLP
committed
Merge branch 'master' of origin
1 parent 58182e9 commit d7356db

File tree

67 files changed

+85902
-85461
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

67 files changed

+85902
-85461
lines changed

doc/corenlp/pom-full.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,15 @@
8585
</dependency>
8686

8787
<dependency>
88-
<groupId>com.io7m.xom</groupId>
88+
<groupId>xom</groupId>
8989
<artifactId>xom</artifactId>
90-
<version>1.2.10</version>
90+
<version>1.3.2</version>
9191
</dependency>
9292

9393
<dependency>
9494
<groupId>joda-time</groupId>
9595
<artifactId>joda-time</artifactId>
96-
<version>2.9.4</version>
96+
<version>2.10.5</version>
9797
</dependency>
9898

9999
<dependency>

itest/src/edu/stanford/nlp/ie/NumberNormalizerITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public static void runOnceBeforeClass() {
2828
System.err.println("Setting up pipeline in @BeforeClasss");
2929
}
3030
pipeline = new AnnotationPipeline();
31-
pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
31+
pipeline.addAnnotator(new TokenizerAnnotator(false, "en", "invertible,splitHyphenated=false"));
3232
pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
3333
pipeline.addAnnotator(new POSTaggerAnnotator(DefaultPaths.DEFAULT_POS_MODEL, false));
3434
}

itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public class NumberSequenceClassifierITest {
2121
private static StanfordCoreNLP makeNumericPipeline() {
2222
Properties props = new Properties();
2323
props.setProperty("annotators", "tokenize, ssplit, pos, number, qen");
24+
props.setProperty("tokenize.options", "splitHyphenated=false");
2425
props.setProperty("customAnnotatorClass.number",
2526
"edu.stanford.nlp.pipeline.NumberAnnotator");
2627
props.setProperty("customAnnotatorClass.qen",

itest/src/edu/stanford/nlp/ie/crf/CRFClassifierITest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,16 +245,16 @@ public class CRFClassifierITest {
245245
{ " \"anaesthetic Smith is\" ",
246246
" \"/Oanaesthetic/O Smith/PERSON is/O\"/O ",
247247
" \"anaesthetic <PERSON>Smith</PERSON> is\" ",
248-
"<wi num=\"0\" entity=\"O\">&dquot;</wi>\n" +
248+
"<wi num=\"0\" entity=\"O\">&quot;</wi>\n" +
249249
"<wi num=\"1\" entity=\"O\">anaesthetic</wi>\n" +
250250
"<wi num=\"2\" entity=\"PERSON\">Smith</wi>\n" +
251251
"<wi num=\"3\" entity=\"O\">is</wi>\n" +
252-
"<wi num=\"4\" entity=\"O\">&dquot;</wi>\n",
253-
" <wi num=\"0\" entity=\"O\">``</wi>" +
252+
"<wi num=\"4\" entity=\"O\">&quot;</wi>\n",
253+
" <wi num=\"0\" entity=\"O\">&quot;</wi>" +
254254
"<wi num=\"1\" entity=\"O\">anaesthetic</wi> " +
255255
"<wi num=\"2\" entity=\"PERSON\">Smith</wi> " +
256256
"<wi num=\"3\" entity=\"O\">is</wi>" +
257-
"<wi num=\"4\" entity=\"O\">&apos;&apos;</wi> ",
257+
"<wi num=\"4\" entity=\"O\">&quot;</wi> ",
258258
"\"/O anaesthetic/O Smith/PERSON is/O \"/O \n",
259259
"\" anaesthetic <PERSON>Smith</PERSON> is \" \n",
260260

itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public void setUp() throws Exception {
2828
synchronized(TokenSequenceMatcherITest.class) {
2929
if (pipeline == null) {
3030
pipeline = new AnnotationPipeline();
31-
pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
31+
pipeline.addAnnotator(new TokenizerAnnotator(false, "en", "invertible,splitHyphenated=false"));
3232
pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
3333
pipeline.addAnnotator(new POSTaggerAnnotator(false));
3434
pipeline.addAnnotator(new NumberAnnotator(false, false));

itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,19 @@ public void testSimpleSentence() throws IOException {
3535
Annotation ann = new Annotation("CoNLL-U is neat. Better than XML.");
3636
pipeline.annotate(ann);
3737
String actual = new CoNLLUOutputter("enhanced").print(ann);
38-
String expected = "1\tCoNLL-U\tconll-u\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t3:nsubj\t_\n" +
39-
"2\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tcop\t3:cop\t_\n" +
40-
"3\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_\n" +
41-
"4\t.\t.\tPUNCT\t.\t_\t3\tpunct\t3:punct\t_\n" +
42-
"\n" +
43-
"1\tBetter\tbetter\tADV\tRBR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
44-
"2\tthan\tthan\tADP\tIN\t_\t3\tcase\t3:case\t_\n" +
45-
"3\tXML\txml\tNOUN\tNN\tNumber=Sing\t1\tobl\t1:obl:than\t_\n" +
46-
"4\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n\n";
38+
String expected = "1\tCoNLL\tconll\tNOUN\tNN\tNumber=Sing\t0\troot\t0:root\t_\n" +
39+
"2\t-\t-\tPUNCT\t:\t_\t1\tpunct\t1:punct\t_\n" +
40+
"3\tU\tU\tPROPN\tNNP\tNumber=Sing\t5\tnsubj\t5:nsubj\t_\n" +
41+
"4\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t5\tcop\t5:cop\t_\n" +
42+
"5\tneat\tneat\tADJ\tJJ\tDegree=Pos\t1\tappos\t1:appos\t_\n" +
43+
"6\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n" +
44+
"\n" +
45+
"1\tBetter\tbetter\tADV\tRBR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
46+
"2\tthan\tthan\tADP\tIN\t_\t3\tcase\t3:case\t_\n" +
47+
"3\tXML\txml\tNOUN\tNN\tNumber=Sing\t1\tobl\t1:obl:than\t_\n" +
48+
"4\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n\n";
4749
assertEquals(expected, actual);
4850
}
4951

5052
}
53+

itest/src/edu/stanford/nlp/pipeline/EntityMentionsAnnotatorITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ public void testNewsText() {
181181
"[Text=China CharacterOffsetBegin=297 CharacterOffsetEnd=302 Tokens=[China-10] TokenBegin=56 TokenEnd=57 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2 EntityMentionIndex=12 CanonicalEntityMentionIndex=12 NamedEntityTagProbs={LOCATION=0.9991144698247745}]",
182182
"[Text=one CharacterOffsetBegin=311 CharacterOffsetEnd=314 Tokens=[one-14] TokenBegin=60 TokenEnd=61 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0 EntityType=NUMBER SentenceIndex=2 EntityMentionIndex=13 CanonicalEntityMentionIndex=13 NamedEntityTagProbs={NUMBER=-1.0}]",
183183
"[Text=10 million CharacterOffsetBegin=366 CharacterOffsetEnd=376 Tokens=[10-25, million-26] TokenBegin=71 TokenEnd=73 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.0E7 EntityType=NUMBER SentenceIndex=2 EntityMentionIndex=14 CanonicalEntityMentionIndex=14 NamedEntityTagProbs={NUMBER=-1.0}]",
184-
"[Text=British CharacterOffsetBegin=377 CharacterOffsetEnd=384 Tokens=[British-27] TokenBegin=73 TokenEnd=74 NamedEntityTag=MISC EntityType=MISC SentenceIndex=2 EntityMentionIndex=15 CanonicalEntityMentionIndex=15 NamedEntityTagProbs={MISC=0.99999912098433}]",
184+
"[Text=British CharacterOffsetBegin=377 CharacterOffsetEnd=384 Tokens=[British-27] TokenBegin=73 TokenEnd=74 NamedEntityTag=MISC EntityType=MISC SentenceIndex=2 EntityMentionIndex=15 CanonicalEntityMentionIndex=15 NamedEntityTagProbs={MISC=0.9999991248574666}]",
185185
"[Text=16.14 million CharacterOffsetBegin=393 CharacterOffsetEnd=406 Tokens=[16.14-30, million-31] TokenBegin=76 TokenEnd=78 NamedEntityTag=NUMBER NormalizedNamedEntityTag=1.614E7 EntityType=NUMBER SentenceIndex=2 EntityMentionIndex=16 CanonicalEntityMentionIndex=16 NamedEntityTagProbs={NUMBER=-1.0}]",
186186
"[Text=U.S. CharacterOffsetBegin=407 CharacterOffsetEnd=411 Tokens=[U.S.-32] TokenBegin=78 TokenEnd=79 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2 EntityMentionIndex=17 CanonicalEntityMentionIndex=17 NamedEntityTagProbs={LOCATION=0.809576375280559}]"
187187
};
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package edu.stanford.nlp.pipeline;
2+
3+
import edu.stanford.nlp.util.Pair;
4+
import junit.framework.TestCase;
5+
6+
import java.io.ByteArrayInputStream;
7+
import java.io.ByteArrayOutputStream;
8+
import java.io.IOException;
9+
import java.io.InputStream;
10+
11+
public class MWTProtobufSerializationITest extends TestCase {
12+
13+
public String sampleText = "Le but des bandes de roulement est d'augmenter la traction. Elle est présidente du conseil " +
14+
"d'administration.";
15+
16+
public StanfordCoreNLP pipeline;
17+
18+
AnnotationSerializer serializer;
19+
20+
@Override
21+
public void setUp() {
22+
// set up pipeline and serializer
23+
pipeline = new StanfordCoreNLP("french");
24+
serializer = new ProtobufAnnotationSerializer();
25+
}
26+
27+
public void testBasicExample() throws ClassNotFoundException, IOException {
28+
// set up document
29+
CoreDocument sampleDocument = new CoreDocument(sampleText);
30+
// annotate
31+
pipeline.annotate(sampleDocument);
32+
// serialize
33+
ByteArrayOutputStream ks = new ByteArrayOutputStream();
34+
serializer.writeCoreDocument(sampleDocument, ks).close();
35+
// Read
36+
InputStream kis = new ByteArrayInputStream(ks.toByteArray());
37+
Pair<Annotation, InputStream> pair = serializer.read(kis);
38+
pair.second.close();
39+
Annotation readAnnotation = pair.first;
40+
kis.close();
41+
ProtobufAnnotationSerializerSlowITest.sameAsRead(sampleDocument.annotation(), readAnnotation);
42+
}
43+
44+
}

itest/src/edu/stanford/nlp/pipeline/QuoteAnnotatorITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ public void testMultiParagraphQuoteSingle() {
386386
" 'I am the second paragraph.\n\n" +
387387
"'I am the second to last.\n\n" +
388388
"'see there's more here.'", quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
389-
assertInnerAnnotationValues(quotes.get(0), 0, 0, 2, 3, 28);
389+
assertInnerAnnotationValues(quotes.get(0), 0, 0, 3, 3, 28);
390390
}
391391

392392
public void testMultiLineQuoteDouble() {

itest/src/edu/stanford/nlp/pipeline/TrueCaseAnnotatorITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public void testTrueCaseAnnotator() {
6161
String text4 = "\"GOOD MORNING AMERICA FROM MCVEY!\"";
6262
String text5 = "\"good morning america from mcvey!\"";
6363
String text6 = "\"Good Morning America From McVey!\"";
64-
String[] ans4 = { "``", "Good", "Morning", "America", "from", "McVey", "!", "''" };
64+
String[] ans4 = { "\"", "Good", "Morning", "America", "from", "McVey", "!", "\"" };
6565

6666
Properties props = PropertiesUtils.asProperties("annotators", "tokenize, ssplit, truecase");
6767
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

itest/src/edu/stanford/nlp/process/PTBTokenizerITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class PTBTokenizerITest extends TestCase {
2424
private static void compareResults(BufferedReader testReader,
2525
List<String> goldResults) {
2626
PTBTokenizer<CoreLabel> tokenizer =
27-
new PTBTokenizer<>(testReader, new CoreLabelTokenFactory(), "");
27+
new PTBTokenizer<>(testReader, new CoreLabelTokenFactory(), "ptb3Escaping=true");
2828
List<String> testResults = new ArrayList<>();
2929
while (tokenizer.hasNext()) {
3030
CoreLabel w = tokenizer.next();

lib/jaxb-core-2.3.0.1.jar

8.89 KB
Binary file not shown.

lib/jaxb-impl-2.4.0-b180830.0438.jar

28.2 KB
Binary file not shown.

lib/joda-time.jar

13.2 KB
Binary file not shown.

lib/xom-1.2.10.jar

-306 KB
Binary file not shown.

lib/xom-1.3.2.jar

320 KB
Binary file not shown.

libsrc/joda-time-2.10.5-sources.jar

802 KB
Binary file not shown.

libsrc/joda-time-2.9-sources.jar

-756 KB
Binary file not shown.

libsrc/xom-1.2.10-src.jar

-656 KB
Binary file not shown.

libsrc/xom-1.3.2-sources.jar

310 KB
Binary file not shown.

pom-java-11.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,13 @@
8585
<dependency>
8686
<groupId>com.io7m.xom</groupId>
8787
<artifactId>xom</artifactId>
88-
<version>1.2.10</version>
88+
<version>1.3.2</version>
8989
</dependency>
9090

9191
<dependency>
9292
<groupId>joda-time</groupId>
9393
<artifactId>joda-time</artifactId>
94-
<version>2.9.4</version>
94+
<version>2.10.5</version>
9595
</dependency>
9696

9797
<dependency>

pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,15 @@
8585
</dependency>
8686

8787
<dependency>
88-
<groupId>com.io7m.xom</groupId>
88+
<groupId>xom</groupId>
8989
<artifactId>xom</artifactId>
90-
<version>1.2.10</version>
90+
<version>1.3.2</version>
9191
</dependency>
9292

9393
<dependency>
9494
<groupId>joda-time</groupId>
9595
<artifactId>joda-time</artifactId>
96-
<version>2.9.4</version>
96+
<version>2.10.5</version>
9797
</dependency>
9898

9999
<dependency>
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Convert five class sentiment data into text files of the type you
2+
# might use a sentence classifier for.
3+
4+
# The default directories are what I set up on my local machine.
5+
# -i and -o change the input and output dirs.
6+
7+
INPUT_DIR=extern_data/sentiment/sentiment-treebank
8+
OUTPUT_DIR=extern_data/sentiment/sst-processed
9+
10+
while getopts "i:o:" OPTION
11+
do
12+
case $OPTION in
13+
i)
14+
INPUT_DIR=$OPTARG
15+
;;
16+
o)
17+
OUTPUT_DIR=$OPTARG
18+
;;
19+
esac
20+
done
21+
22+
23+
echo INPUT DIR: $INPUT_DIR
24+
echo OUTPUT DIR: $OUTPUT_DIR
25+
26+
mkdir -p $OUTPUT_DIR/binary
27+
mkdir -p $OUTPUT_DIR/fiveclass
28+
29+
echo $OUTPUT_DIR/fiveclass/train-phrases.txt
30+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/train.txt > $OUTPUT_DIR/fiveclass/train-phrases.txt
31+
32+
echo $OUTPUT_DIR/fiveclass/dev-phrases.txt
33+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt > $OUTPUT_DIR/fiveclass/dev-phrases.txt
34+
35+
echo $OUTPUT_DIR/fiveclass/test-phrases.txt
36+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt > $OUTPUT_DIR/fiveclass/test-phrases.txt
37+
38+
39+
echo $OUTPUT_DIR/fiveclass/train-roots.txt
40+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/train.txt -root_only > $OUTPUT_DIR/fiveclass/train-roots.txt
41+
42+
echo $OUTPUT_DIR/fiveclass/dev-roots.txt
43+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt -root_only > $OUTPUT_DIR/fiveclass/dev-roots.txt
44+
45+
echo $OUTPUT_DIR/fiveclass/test-roots.txt
46+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt -root_only > $OUTPUT_DIR/fiveclass/test-roots.txt
47+
48+
49+
echo $OUTPUT_DIR/binary/train-binary-phrases.txt
50+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/train.txt -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/train-binary-phrases.txt
51+
52+
echo $OUTPUT_DIR/binary/dev-binary-phrases.txt
53+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/dev-binary-phrases.txt
54+
55+
echo $OUTPUT_DIR/binary/test-binary-phrases.txt
56+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/test-binary-phrases.txt
57+
58+
echo $OUTPUT_DIR/binary/dev-binary-roots.txt
59+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/dev.txt -root_only -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/dev-binary-roots.txt
60+
61+
echo $OUTPUT_DIR/binary/test-binary-roots.txt
62+
java edu.stanford.nlp.trees.OutputSubtrees -input $INPUT_DIR/fiveclass/test.txt -root_only -ignore_labels 2 -remap_labels "1=0,2=-1,3=1,4=1" > $OUTPUT_DIR/binary/test-binary-roots.txt
63+

src/edu/stanford/nlp/classify/Dataset.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ public class Dataset<L, F> extends GeneralDataset<L, F> {
5252

5353
final static Redwood.RedwoodChannels logger = Redwood.channels(Dataset.class);
5454

55+
/** we will multiply by this constant instead of divide by log(2) */
56+
private static final double LN_TO_LOG2 = 1. / Math.log(2);
57+
5558
public Dataset() {
5659
this(10);
5760
}
@@ -658,7 +661,7 @@ public double[] getInformationGains() {
658661
for (int i = 0; i < labelIndex.size(); i++) {
659662
double labelCount = labelCounter.getCount(labelIndex.get(i));
660663
double p = labelCount / size();
661-
entropy -= p * (Math.log(p) / Math.log(2));
664+
entropy -= p * Math.log(p) * LN_TO_LOG2;
662665
}
663666

664667
double[] ig = new double[featureIndex.size()];
@@ -693,11 +696,11 @@ public double[] getInformationGains() {
693696
double pNot = notFeatureLabelCount / notFeatureCount;
694697

695698
if (featureLabelCount != 0) {
696-
sumFeature += p * (Math.log(p) / Math.log(2));
699+
sumFeature += p * Math.log(p) * LN_TO_LOG2;
697700
}
698701

699702
if (notFeatureLabelCount != 0) {
700-
sumNotFeature += pNot * (Math.log(pNot) / Math.log(2));
703+
sumNotFeature += pNot * Math.log(pNot) * LN_TO_LOG2;
701704
}
702705
//System.out.println(pNot+" "+(Math.log(pNot)/Math.log(2)));
703706

src/edu/stanford/nlp/classify/LogPrior.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ public class LogPrior implements Serializable {
1616

1717
public enum LogPriorType { NULL, QUADRATIC, HUBER, QUARTIC, COSH, ADAPT, MULTIPLE_QUADRATIC }
1818

19+
private static final double LOG2 = Math.log(2);
20+
1921
public static LogPriorType getType(String name) {
2022
if (name.equalsIgnoreCase("null")) { return LogPriorType.NULL; }
2123
else if (name.equalsIgnoreCase("quadratic")) { return LogPriorType.QUADRATIC; }
@@ -298,7 +300,7 @@ public double compute(double[] x, double[] grad) {
298300
double norm = ArrayMath.norm_1(x) / sigmaSq;
299301
double d;
300302
if (norm > 30.0) {
301-
val = norm - Math.log(2);
303+
val = norm - LOG2;
302304
d = 1.0 / sigmaSq;
303305
} else {
304306
val = Math.log(Math.cosh(norm));

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,15 +1601,15 @@ protected void printFeatures(IN wi, Collection<String> features) {
16011601
}
16021602

16031603
/** Print the String features generated from a token. */
1604-
protected void printFeatureLists(IN wi, Collection<List<String>> features) {
1604+
protected void printFeatureLists(IN wi, Collection<Collection<String>> features) {
16051605
if (flags.printFeatures == null || writtenNum >= flags.printFeaturesUpto) {
16061606
return;
16071607
}
16081608
printFeatureListsHelper(wi, features);
16091609
}
16101610

16111611
// Separating this method out lets printFeatureLists be inlined, which is good since it is usually a no-op.
1612-
private void printFeatureListsHelper(IN wi, Collection<List<String>> features) {
1612+
private void printFeatureListsHelper(IN wi, Collection<Collection<String>> features) {
16131613
if (cliqueWriter == null) {
16141614
cliqueWriter = IOUtils.getPrintWriterOrDie("features-" + flags.printFeatures + ".txt");
16151615
writtenNum = 0;
@@ -1622,7 +1622,7 @@ private void printFeatureListsHelper(IN wi, Collection<List<String>> features) {
16221622
+ wi.get(CoreAnnotations.GoldAnswerAnnotation.class) + '\t');
16231623
}
16241624
boolean first = true;
1625-
for (List<String> featList : features) {
1625+
for (Collection<String> featList : features) {
16261626
List<String> sortedFeatList = new ArrayList<>(featList);
16271627
Collections.sort(sortedFeatList);
16281628
for (String feat : sortedFeatList) {

src/edu/stanford/nlp/ie/crf/CRFBiasedClassifier.java

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,19 @@ public CRFBiasedClassifier(Properties props) {
5757
public CRFBiasedClassifier(SeqClassifierFlags flags) {super(flags); }
5858

5959
@Override
60-
public CRFDatum<List<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
60+
public CRFDatum<Collection<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
6161

6262
pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
6363
PaddedList<IN> pInfo = new PaddedList<>(info, pad);
6464

65-
List<List<String>> features = new ArrayList<>();
66-
Collection<Clique> done = Generics.newHashSet();
65+
List<Collection<String>> features = new ArrayList<>();
6766
for (int i = 0; i < windowSize; i++) {
6867
List<String> featuresC = new ArrayList<>();
69-
List<Clique> windowCliques = FeatureFactory.getCliques(i, 0);
70-
windowCliques.removeAll(done);
71-
done.addAll(windowCliques);
72-
for (Clique c : windowCliques) {
68+
FeatureFactory.eachClique(i, 0, c -> {
7369
for (FeatureFactory<IN> featureFactory : featureFactories) {
7470
featuresC.addAll(featureFactory.getCliqueFeatures(pInfo, loc, c));
7571
}
76-
}
72+
});
7773
if (testTime && i==0) {
7874
// this feature is only present at test time and only appears
7975
// in cliques of size 1 (i.e., cliques with window=0)

0 commit comments

Comments
 (0)