Commit

Merge branch 'release-0.05'

mikecafarella committed Feb 9, 2015
2 parents b778553 + 307ab86 commit 4fdfe91
Showing 30 changed files with 650 additions and 108 deletions.
41 changes: 33 additions & 8 deletions Dockerfile
@@ -1,5 +1,5 @@
FROM ubuntu
MAINTAINER Adam Goldberg <[email protected]>
MAINTAINER [email protected]
RUN sudo apt-get update
RUN sudo apt-get -y install emacs
RUN sudo apt-get -y install default-jre default-jdk
@@ -8,16 +8,41 @@ RUN sudo apt-get -y install gnuplot
RUN sudo apt-get -y install postgresql postgresql-contrib
RUN sudo apt-get -y install git
RUN sudo apt-get -y install build-essential
RUN sudo apt-get -y install libnuma-dev
RUN sudo apt-get -y install bc
RUN cd ~/ && git clone https://github.com/HazyResearch/deepdive.git
RUN sudo apt-get -y install zip unzip
RUN cd ~/deepdive && make

# Configure environment variables
RUN echo 'export PGUSER=postgres' >> ~/.bashrc
RUN echo 'export PGPORT=$POSTGRES_PORT_5432_TCP_PORT' >> ~/.bashrc
RUN echo 'export PGHOST=$POSTGRES_PORT_5432_TCP_ADDR' >> ~/.bashrc
RUN echo 'export PGPASSWORD=password' >> ~/.bashrc
RUN echo 'export PGUSER=postgres' >> ~/.bashrc
RUN echo 'export PGPORT=$DB_PORT_5432_TCP_PORT' >> ~/.bashrc
RUN echo 'export PGHOST=$DB_PORT_5432_TCP_ADDR' >> ~/.bashrc
RUN echo 'export PGPASSWORD=' >> ~/.bashrc
RUN echo 'export PGUSER=gpadmin' >> ~/.bashrc
RUN echo 'export DEEPDIVE_HOME=~/deepdive' >> ~/.bashrc
RUN echo 'export LD_LIBRARY_PATH=$DEEPDIVE_HOME/lib/dw_linux/lib:$DEEPDIVE_HOME/lib/dw_linux/lib64' >> ~/.bashrc
RUN echo 'export PATH=~/deepdive/sbt:$PATH' >> ~/.bashrc
RUN echo 'export LD_LIBRARY_PATH=$DEEPDIVE_HOME/lib/dw_linux/lib:$DEEPDIVE_HOME/sbt:$DEEPDIVE_HOME/lib/dw_linux/lib64' >> ~/.bashrc
RUN echo 'export PATH=~/deepdive/sbt:$PATH' >> ~/.bashrc

# Initialize script to wait for greenplum
RUN echo 'while true; do' >> ~/.bashrc
RUN echo ' psql -q -h $DB_PORT_5432_TCP_ADDR -p $DB_PORT_5432_TCP_PORT -U gpadmin deepdive -c "SELECT 1;" > /dev/null 2>&1' >> ~/.bashrc
RUN echo ' RETVAL=$?' >> ~/.bashrc
RUN echo ' [ $RETVAL -eq 0 ] && break' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB.\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB..\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB...\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB....\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo 'done' >> ~/.bashrc
RUN echo 'echo -ne "\nGreenplum is up and running! You may now use deepdive.\n"' >> ~/.bashrc

RUN sed -i s/'sbt "test-only org.deepdive.test.integration.ChunkingApp -- -oF"'/'echo "Skipping ChunkingApp" \#sbt "test-only org.deepdive.test.integration.ChunkingApp -- -oF"'/g /root/deepdive/test/test_psql.sh

RUN mkdir -p ~/deepdive/app

VOLUME ["/root/deepdive/app"]
34 changes: 32 additions & 2 deletions README.md
@@ -1,8 +1,38 @@
# DeepDive
# DeepDive v0.05

Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0.txt

Tested with Travis CI.
Tested with Travis CI.
[![Build Status](https://travis-ci.org/HazyResearch/deepdive.svg?branch=master)](https://travis-ci.org/HazyResearch/deepdive)

### [Visit The DeepDive Website](http://deepdive.stanford.edu)

Docker instructions:
<pre>
# Pull the DeepDive image
# You may specify 'latest' or 'dev' as the tag instead of 'develop'
docker pull adamwgoldberg/deepdive-github:develop

# Run my Greenplum image from Docker Hub (it is pulled automatically). Contact me if you need access to the private repository on Docker Hub.
docker run -d --privileged --name db -h gphost adamwgoldberg/greenplum

# Run Deepdive
# All deepdive application code should be created in /root/deepdive/app
# Make sure the deepdive-github tag matches the above one.
docker run -t -d --link db:db --name deepdive adamwgoldberg/deepdive-github:develop bash

# Attach shell to Deepdive
# You may need to wait several minutes for Greenplum to initialize.
# The bash shell will say "Waiting for DB..." until it finishes.
docker exec -ti deepdive bash

# Inside of that shell run:
cd ~/deepdive
make test
</pre>

Docker tips:
* An AWS EC2 m.xlarge instance in the Virginia region using ami-84e897ec is a great place to start
* Ensure you have at least 20GB of storage
* Any machine with Docker installed should work fine
* Due to licensing, Greenplum is not freely available outside of our lab. You may wish to use a Dockerized postgres instead.
14 changes: 13 additions & 1 deletion doc/doc/advanced/reserved_tables.md
@@ -18,7 +18,10 @@ with the same name in the database:
public | dd_inference_result_variables | table
public | dd_inference_result_weights | table
public | dd_inference_result_weights_mapping | view
public | dd_feature_statistics_support | table
public | dd_feature_statistics | view
public | dd_query_[RULE_NAME] | table
public | dd_weights_[RULE_NAME] | table
public | [TABLE]_[VARIABLE]_inference | view
public | [TABLE]_[VARIABLE]_inference_bucketed | view
public | [TABLE]_[VARIABLE]_calibration | view
@@ -44,7 +47,16 @@ Description of each schema:

- `dd_inference_result_weights_mapping`: a view that maps all distinct factor weights to their descriptions and learned values. It is a commonly used view that shows the learned weight value of a factor as well as the number of occurrences of that factor.

- `dd_query_[RULE_NAME]`: a view that is defined by the input query of an [inference rule](../basics/inference_rules.html).
- `dd_feature_statistics_support`: a table that maps each distinct feature description to its number of positive examples, negative examples, and query variables.
  - e.g., the feature "word_seq=[is-married-to]" might be associated with 1000 positive examples, 10 negative examples, and 3000 query variables.
  - The statistics are computed only for unary factors (with factor function `IsTrue`).

- `dd_feature_statistics`: a view that joins `dd_inference_result_weights_mapping` and `dd_feature_statistics_support`. For each distinct feature description it gathers the learned weight and the number of positive examples, negative examples, and query variables.
  - Non-unary factors will have NULL values in the statistics columns.

- `dd_query_[RULE_NAME]`: a table that is defined by the input query of an [inference rule](../basics/inference_rules.html). You can use it as a feature table in BrainDump.

- `dd_weight_[RULE_NAME]`: a table that stores initial weights for factors, used internally.

- `[TABLE]_[VARIABLE]_inference`: a view that maps variables with their inference results. It is commonly used for error analysis.

46 changes: 38 additions & 8 deletions doc/doc/basics/calibration.md
@@ -49,24 +49,54 @@ each variable, and the learned factor weights. DeepDive creates a view called
learned values sorted by absolute value. The
`dd_inference_result_weights_mapping` view has the following schema:

View "public.dd_inference_result_weights_mapping"
Column | Type | Modifiers | Storage | Description
---------------+------------------+-----------+----------+-------------
id | bigint | | plain |
initial_value | double precision | | plain |
is_fixed | boolean | | plain |
description | text | | extended |
weight | double precision | | plain |
View "public.dd_inference_result_weights_mapping"
Column | Type | Modifiers
-------------+------------------+-----------
id | bigint |
isfixed | integer |
initvalue | real |
cardinality | text |
description | text |
weight | double precision |


Specification for these fields:

- **id**: the unique identifier for the weight
- **initvalue**: the initial value for the weight
- **isfixed**: whether the weight is fixed (i.e., cannot be changed during learning)
- **cardinality**: the cardinality of this factor; meaningful for [multinomial factors](chunking.html)
- **description**: a description of the weight, composed as [name of the inference rule]-[the specified "weight" value in the inference rule]
- **weight**: the learned weight value
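
For a quick look at which factors dominate the model, you can query this view directly. A minimal sketch (assuming DeepDive has already run learning and inference against your PostgreSQL / Greenplum database):

    -- Sketch: the 20 weights with the largest absolute learned value.
    SELECT description, weight
    FROM dd_inference_result_weights_mapping
    ORDER BY ABS(weight) DESC
    LIMIT 20;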

DeepDive also creates a view `dd_feature_statistics` that maps each factor's description and learned weight to the number of positive and negative examples associated with it:

View "public.dd_feature_statistics"
Column | Type | Modifiers
--------------+------------------+-----------
id | bigint |
isfixed | integer |
initvalue | real |
cardinality | text |
description | text |
weight | double precision |
pos_examples | bigint |
neg_examples | bigint |
queries | bigint |

It has all columns from `dd_inference_result_weights_mapping`, and three additional columns:

- **pos_examples**: The number of positive examples associated with this feature.
- **neg_examples**: The number of negative examples associated with this feature.
- **queries**: The number of queries associated with this feature.

Note these columns contain non-NULL values only when the factor is a unary `IsTrue` factor.

This view can be used to diagnose features. For example, if a feature gets
a very high weight, it might be because there are not enough negative
examples for that feature, and you may need to add more data or more
distant supervision rules.
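
A query along the following lines surfaces such suspicious features. This is a sketch; the cutoff of 10 negative examples is an arbitrary illustrative threshold:

    -- Sketch: high-weight features with very few negative examples.
    -- Only unary IsTrue factors have non-NULL statistics columns.
    SELECT description, weight, pos_examples, neg_examples, queries
    FROM dd_feature_statistics
    WHERE neg_examples IS NOT NULL
      AND neg_examples < 10
    ORDER BY ABS(weight) DESC
    LIMIT 20;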

### Calibration data and plots

The system generates a calibration data file for each variable defined in the
2 changes: 1 addition & 1 deletion doc/doc/basics/chunking.md
@@ -6,7 +6,7 @@ layout: default

## Introduction

In this document, we will describe an example application of text chunking using DeepDive. This example assumes a working installation of DeepDive, and basic knowledge of how to build an application in DeepDive. Please go through the [example application walkthrough](walkthrough/walkthrough.html) before preceding.
In this document, we will describe an example application of text chunking using DeepDive, and demonstrate how to use **Multinomial variables**. This example assumes a working installation of DeepDive, and basic knowledge of how to build an application in DeepDive. Please go through the [example application walkthrough](walkthrough/walkthrough.html) before proceeding.

Text chunking consists of dividing a text into syntactically correlated groups of words. For example, the sentence "He reckons the current account deficit will narrow to only # 1.8 billion in September ." can be divided as follows:

13 changes: 13 additions & 0 deletions doc/doc/changelog/0.05.01-alpha.md
@@ -0,0 +1,13 @@
---
layout: default
---

# Changelog for release 0.0.5-alpha (02/08/2015)

- Added support for building Docker images for DeepDive. See the README.md for more.
- Added the SQL "FeatureStatsView" view, populated with feature
  statistics; useful for debugging.
- Added a few fixes to the Greenplum docs
- Added parallel Greenplum loading for extractor data
- A few misc bugfixes

6 changes: 4 additions & 2 deletions examples/nlp_extractor/src/main/scala/DocumentParser.scala
@@ -10,11 +10,12 @@ import edu.stanford.nlp.pipeline._
import edu.stanford.nlp.util._
import edu.stanford.nlp.ling.CoreAnnotations._
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation
import scala.collection.JavaConversions._
import java.io.{StringReader, StringWriter, PrintWriter}
import java.util.Properties

// Future usage: coreference resolution
// import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation

class DocumentParser(props: Properties) {

@@ -24,7 +25,8 @@ class DocumentParser(props: Properties) {

val document = new Annotation(doc)
pipeline.annotate(document)
val dcoref = document.get(classOf[CorefChainAnnotation])
// Future usage: coreference resolution
// val dcoref = document.get(classOf[CorefChainAnnotation])
val sentences = document.get(classOf[SentencesAnnotation])

val sentenceResults = sentences.zipWithIndex.map { case(sentence, sentIdx) =>
2 changes: 1 addition & 1 deletion examples/nlp_extractor/src/main/scala/Main.scala
@@ -35,7 +35,7 @@ object Main extends App {

// Configuration has been parsed, execute the Document parser
val props = new Properties()
props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, dcoref")
props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse")
props.put("parse.maxlen", conf.maxSentenceLength)
props.put("threads", conf.numThreads)
val dp = new DocumentParser(props)
1 change: 1 addition & 0 deletions examples/nlp_extractor_tsv/.gitignore
@@ -0,0 +1 @@
target/
93 changes: 93 additions & 0 deletions examples/nlp_extractor_tsv/README.md
@@ -0,0 +1,93 @@
# NLP Extractor

This directory provides an NLP extractor that is a wrapper around
Stanford CoreNLP. Its input is textual data in TSV format, and its output
is one or more TSV files containing processed sentence data that
can be loaded directly into PostgreSQL / Greenplum tables.

In (1) we show how to use this NLP extractor as an "extractor" in
DeepDive, and in (2) we provide a script to run it as a stand-alone
application in parallel, without DeepDive. The latter is the recommended
way if you have a lot of data, since it achieves more
parallelism.

## Compile

Compile the NLP extractor using the following command in this directory:

sbt stage

## (1) run.sh and Integration with DeepDive

### Input

The input is a TSV file of the following form, where each line is a
document.

The TSV file should have two columns. The first column is
`document_id` (text format), a unique identifier for the document. The
second column is `text`, the full text of the document in plain text:

doc1\tFull text in doc1
doc2\tFull text in doc2
...

Note that the input TSV file should not have headers.

### Output

The output is another TSV file with the following columns:

1. `document_id`: the document_id from the input
2. `sentence`: the raw sentence text, as in the input
3. `words`: (PSQL-formatted) array of words
4. `lemma`: array of lemmatized words
5. `pos_tags`: array of part-of-speech tags
6. `ner_tags`: array of named entity tags
7. `dependencies`: array of collapsed dependencies
8. `sentence_offset`: the index / offset of this sentence within the document
9. `sentence_id`: a unique identifier for the sentence

You can create a table like the following to import the output TSV
file into the database. (Note that this is the `output_relation` in
DeepDive.)

CREATE TABLE sentences(
document_id bigint,
sentence text,
words text[],
lemma text[],
pos_tags text[],
dependencies text[],
ner_tags text[],
sentence_offset bigint,
sentence_id text
);
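
Once the output has been loaded, the PSQL-formatted arrays can be queried directly. A small sketch (the table and column names are the ones defined above; the 'PERSON' tag value is just an illustration):

    -- Sketch: sentences whose NER tags include a PERSON mention.
    SELECT document_id, sentence_id, sentence
    FROM sentences
    WHERE 'PERSON' = ANY(ner_tags)
    LIMIT 10;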

## (2) run_parallel.sh: Stand-alone Parallel NLP Extractor

When `run.sh` is used with DeepDive, the ideal parallelism sometimes
cannot be achieved because of memory constraints. In this case, we
recommend using `run_parallel.sh`, which performs the following steps:

1. Splits your input file `INPUT_FILE` into chunks in `INPUT_FILE.split/`
2. Uses the system tool `xargs` to run `run.sh` on the chunks in parallel.
   The outputs are saved to `INPUT_FILE.split/*.out`.

Run it with the following command:

./run_parallel.sh INPUT_FILE PARALLELISM [INPUT_BATCH_SIZE=100] [SENTENCE_WORDS_LIMIT=120]

- `INPUT_FILE`: your input TSV file
- `PARALLELISM`: a number indicating the desired parallelism, e.g. 8
- `INPUT_BATCH_SIZE`: how many lines go into each file after splitting.
  Default 100.
- `SENTENCE_WORDS_LIMIT`: skip dependency parsing for sentences with more
  words than this limit. This helps speed up the parsing.

When finished, you should manually import the files in
`INPUT_FILE.split/*.out` into your database. You can use a COPY query
like this:

cat INPUT_FILE.split/*.out | psql YOUR_DB_NAME -c "COPY sentences FROM STDIN"
22 changes: 22 additions & 0 deletions examples/nlp_extractor_tsv/build.sbt
@@ -0,0 +1,22 @@
import com.typesafe.sbt.SbtStartScript

name := "deepdive-nlp-parser"

version := "0.1"

scalaVersion := "2.10.3"

resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"

libraryDependencies ++= List(
"ch.qos.logback" % "logback-classic" % "1.0.7",
"com.typesafe.play" %% "play-json" % "2.2.1",
"com.github.scopt" %% "scopt" % "3.2.0",
"edu.stanford.nlp" % "stanford-corenlp" % "3.3.1",
"edu.stanford.nlp" % "stanford-corenlp" % "3.3.1" classifier "models",
"org.scalatest" % "scalatest_2.10" % "2.0.RC2" % "test"
)

parallelExecution in Test := false

seq(SbtStartScript.startScriptForClassesSettings: _*)
1 change: 1 addition & 0 deletions examples/nlp_extractor_tsv/project/plugins.sbt
@@ -0,0 +1 @@
addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0")
5 changes: 5 additions & 0 deletions examples/nlp_extractor_tsv/run.sh
@@ -0,0 +1,5 @@
#! /usr/bin/env bash

# export SBT_OPTS="-Xmx1g"

$(dirname $0)/target/start $@