Commit

Merge branch 'release-0.05'

mikecafarella committed Feb 9, 2015
2 parents b778553 + 307ab86 commit 4fdfe91
Showing 30 changed files with 650 additions and 108 deletions.
41 changes: 33 additions & 8 deletions Dockerfile
@@ -1,5 +1,5 @@
FROM ubuntu
MAINTAINER Adam Goldberg <[email protected]>
MAINTAINER [email protected]
RUN sudo apt-get update
RUN sudo apt-get -y install emacs
RUN sudo apt-get -y install default-jre default-jdk
@@ -8,16 +8,41 @@ RUN sudo apt-get -y install gnuplot
RUN sudo apt-get -y install postgresql postgresql-contrib
RUN sudo apt-get -y install git
RUN sudo apt-get -y install build-essential
RUN sudo apt-get -y install libnuma-dev
RUN sudo apt-get -y install bc
RUN cd ~/ && git clone https://github.com/HazyResearch/deepdive.git
RUN sudo apt-get -y install zip unzip
RUN cd ~/deepdive && make

# Configure environment variables
RUN echo 'export PGUSER=postgres' >> ~/.bashrc
RUN echo 'export PGPORT=$POSTGRES_PORT_5432_TCP_PORT' >> ~/.bashrc
RUN echo 'export PGHOST=$POSTGRES_PORT_5432_TCP_ADDR' >> ~/.bashrc
RUN echo 'export PGPASSWORD=password' >> ~/.bashrc
RUN echo 'export PGUSER=postgres' >> ~/.bashrc
RUN echo 'export PGPORT=$DB_PORT_5432_TCP_PORT' >> ~/.bashrc
RUN echo 'export PGHOST=$DB_PORT_5432_TCP_ADDR' >> ~/.bashrc
RUN echo 'export PGPASSWORD=' >> ~/.bashrc
RUN echo 'export PGUSER=gpadmin' >> ~/.bashrc
RUN echo 'export DEEPDIVE_HOME=~/deepdive' >> ~/.bashrc
RUN echo 'export LD_LIBRARY_PATH=$DEEPDIVE_HOME/lib/dw_linux/lib:$DEEPDIVE_HOME/lib/dw_linux/lib64' >> ~/.bashrc
RUN echo 'export PATH=~/deepdive/sbt:$PATH' >> ~/.bashrc
RUN echo 'export LD_LIBRARY_PATH=$DEEPDIVE_HOME/lib/dw_linux/lib:$DEEPDIVE_HOME/sbt:$DEEPDIVE_HOME/lib/dw_linux/lib64' >> ~/.bashrc
RUN echo 'export PATH=~/deepdive/sbt:$PATH' >> ~/.bashrc

# Initialize script to wait for greenplum
RUN echo 'while true; do' >> ~/.bashrc
RUN echo ' psql -q -h $DB_PORT_5432_TCP_ADDR -p $DB_PORT_5432_TCP_PORT -U gpadmin deepdive -c "SELECT 1;" > /dev/null 2>&1' >> ~/.bashrc
RUN echo ' RETVAL=$?' >> ~/.bashrc
RUN echo ' [ $RETVAL -eq 0 ] && break' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB.\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB..\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB...\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo ' echo -ne "Waiting for DB....\r"' >> ~/.bashrc
RUN echo ' sleep 1' >> ~/.bashrc
RUN echo 'done' >> ~/.bashrc
RUN echo 'echo -ne "\nGreenplum is up and running! You may now use deepdive.\n"' >> ~/.bashrc

RUN sed -i s/'sbt "test-only org.deepdive.test.integration.ChunkingApp -- -oF"'/'echo "Skipping ChunkingApp" \#sbt "test-only org.deepdive.test.integration.ChunkingApp -- -oF"'/g /root/deepdive/test/test_psql.sh

RUN mkdir -p ~/deepdive/app

VOLUME ["/root/deepdive/app"]
34 changes: 32 additions & 2 deletions README.md
@@ -1,8 +1,38 @@
# DeepDive
# DeepDive v0.05

Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0.txt

Tested with Travis CI.
Tested with Travis CI.
[![Build Status](https://travis-ci.org/HazyResearch/deepdive.svg?branch=master)](https://travis-ci.org/HazyResearch/deepdive)

### [Visit The DeepDive Website](http://deepdive.stanford.edu)

Docker instructions:
<pre>
# Pull the DeepDive image
# You may specify 'latest' or 'dev' as the tag instead of 'develop'
docker pull adamwgoldberg/deepdive-github:develop

# Run my Greenplum image from Docker Hub (it is pulled automatically). Contact me if you need access to the private repository on Docker Hub.
docker run -d --privileged --name db -h gphost adamwgoldberg/greenplum

# Run Deepdive
# All deepdive application code should be created in /root/deepdive/app
# Make sure the deepdive-github tag matches the above one.
docker run -t -d --link db:db --name deepdive adamwgoldberg/deepdive-github:develop bash

# Attach shell to Deepdive
# You may need to wait several minutes for Greenplum to initialize.
# The bash shell will say "Waiting for DB..." until it finishes.
docker exec -ti deepdive bash

# Inside of that shell run:
cd ~/deepdive
make test
</pre>

Docker tips:
* An AWS EC2 m.xlarge instance in the Virginia region using ami-84e897ec is a great place to start
* Ensure you have at least 20GB of storage
* Any machine with Docker installed should work fine
* Due to licensing, Greenplum is not freely available outside of our lab. You may wish to use a Dockerized postgres instead.
14 changes: 13 additions & 1 deletion doc/doc/advanced/reserved_tables.md
@@ -18,7 +18,10 @@ with the same name in the database:
public | dd_inference_result_variables | table
public | dd_inference_result_weights | table
public | dd_inference_result_weights_mapping | view
public | dd_feature_statistics_support | table
public | dd_feature_statistics | view
public | dd_query_[RULE_NAME] | table
public | dd_weights_[RULE_NAME] | table
public | [TABLE]_[VARIABLE]_inference | view
public | [TABLE]_[VARIABLE]_inference_bucketed | view
public | [TABLE]_[VARIABLE]_calibration | view
@@ -44,7 +47,16 @@ Description of each schema:

- `dd_inference_result_weights_mapping`: a view that maps all distinct factor weights to their descriptions and learned values. It is a commonly used view that shows the learned weight value of a factor as well as the number of occurrences of that factor.

- `dd_query_[RULE_NAME]`: a view that is defined by the input query of an [inference rule](../basics/inference_rules.html).
- `dd_feature_statistics_support`: a table that maps each distinct feature description to its number of positive examples, negative examples, and query variables.
  - e.g., the feature "word_seq=[is-married-to]" might be associated with 1000 positive examples, 10 negative examples, and 3000 query variables.
  - The statistics are computed only for unary factors (with factor function `IsTrue`).

- `dd_feature_statistics`: a view that joins `dd_inference_result_weights_mapping` and `dd_feature_statistics_support`. For each distinct feature description it gathers the learned weight and the number of positive examples, negative examples, and query variables.
  - Non-unary factors will have NULL values in the statistics columns.

- `dd_query_[RULE_NAME]`: a table that is defined by the input query of an [inference rule](../basics/inference_rules.html). You can use it as a feature table in BrainDump.

- `dd_weight_[RULE_NAME]`: a table that stores initial weights for factors, used internally.

- `[TABLE]_[VARIABLE]_inference`: a view that maps variables with their inference results. It is commonly used for error analysis.

46 changes: 38 additions & 8 deletions doc/doc/basics/calibration.md
@@ -49,24 +49,54 @@ each variable, and the learned factor weights. DeepDive creates a view called
learned values sorted by absolute value. The
`dd_inference_result_weights_mapping` view has the following schema:

View "public.dd_inference_result_weights_mapping"
Column | Type | Modifiers | Storage | Description
---------------+------------------+-----------+----------+-------------
id | bigint | | plain |
initial_value | double precision | | plain |
is_fixed | boolean | | plain |
description | text | | extended |
weight | double precision | | plain |
View "public.dd_inference_result_weights_mapping"
Column | Type | Modifiers
-------------+------------------+-----------
id | bigint |
isfixed | integer |
initvalue | real |
cardinality | text |
description | text |
weight | double precision |


Specification for these fields:

- **id**: the unique identifier for the weight
- **initvalue**: the initial value for the weight
- **isfixed**: whether the weight is fixed (i.e., cannot be changed during learning)
- **cardinality**: the cardinality of this factor; meaningful for [multinomial factors](chunking.html)
- **description**: a description of the weight, composed as [name of the inference rule]-[the specified "weight" value in the inference rule]
- **weight**: the learned weight value
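
For a quick look at which factors dominate the model, you can query this view directly. A minimal sketch (assuming DeepDive has already run learning and inference against your PostgreSQL / Greenplum database):

    -- Sketch: the 20 weights with the largest absolute learned value.
    SELECT description, weight
    FROM dd_inference_result_weights_mapping
    ORDER BY ABS(weight) DESC
    LIMIT 20;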

DeepDive also creates a view `dd_feature_statistics` that maps each factor's description and learned weight to the number of positive and negative examples associated with it:

View "public.dd_feature_statistics"
Column | Type | Modifiers
--------------+------------------+-----------
id | bigint |
isfixed | integer |
initvalue | real |
cardinality | text |
description | text |
weight | double precision |
pos_examples | bigint |
neg_examples | bigint |
queries | bigint |

It has all columns from `dd_inference_result_weights_mapping`, and three additional columns:

- **pos_examples**: The number of positive examples associated with this feature.
- **neg_examples**: The number of negative examples associated with this feature.
- **queries**: The number of queries associated with this feature.

Note these columns contain non-NULL values only when the factor is a unary `IsTrue` factor.

This view can be used to diagnose features. For example, if a feature gets
a very high weight, it might be because there are not enough negative
examples for that feature, and you may need to add more data or more
distant supervision rules.
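
A query along the following lines surfaces such suspicious features. This is a sketch; the cutoff of 10 negative examples is an arbitrary illustrative threshold:

    -- Sketch: high-weight features with very few negative examples.
    -- Only unary IsTrue factors have non-NULL statistics columns.
    SELECT description, weight, pos_examples, neg_examples, queries
    FROM dd_feature_statistics
    WHERE neg_examples IS NOT NULL
      AND neg_examples < 10
    ORDER BY ABS(weight) DESC
    LIMIT 20;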

### Calibration data and plots

The system generates a calibration data file for each variable defined in the
2 changes: 1 addition & 1 deletion doc/doc/basics/chunking.md
@@ -6,7 +6,7 @@ layout: default

## Introduction

In this document, we will describe an example application of text chunking using DeepDive. This example assumes a working installation of DeepDive, and basic knowledge of how to build an application in DeepDive. Please go through the [example application walkthrough](walkthrough/walkthrough.html) before preceding.
In this document, we will describe an example application of text chunking using DeepDive, and demonstrate how to use **Multinomial variables**. This example assumes a working installation of DeepDive, and basic knowledge of how to build an application in DeepDive. Please go through the [example application walkthrough](walkthrough/walkthrough.html) before proceeding.

Text chunking consists of dividing a text into syntactically correlated groups of words. For example, the sentence "He reckons the current account deficit will narrow to only # 1.8 billion in September ." can be divided as follows:

13 changes: 13 additions & 0 deletions doc/doc/changelog/0.05.01-alpha.md
@@ -0,0 +1,13 @@
---
layout: default
---

# Changelog for release 0.0.5-alpha (02/08/2015)

- Added support for building Docker images for DeepDive. See the README.md for more.
- Added the SQL "FeatureStatsView" view, populated with feature
  statistics; useful for debugging.
- Added a few fixes to the Greenplum docs
- Added parallel Greenplum loading for extractor data
- A few misc bugfixes

6 changes: 4 additions & 2 deletions examples/nlp_extractor/src/main/scala/DocumentParser.scala
@@ -10,11 +10,12 @@ import edu.stanford.nlp.pipeline._
import edu.stanford.nlp.util._
import edu.stanford.nlp.ling.CoreAnnotations._
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation
import scala.collection.JavaConversions._
import java.io.{StringReader, StringWriter, PrintWriter}
import java.util.Properties

// Future usage: coreference resolution
// import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation

class DocumentParser(props: Properties) {

@@ -24,7 +25,8 @@ class DocumentParser(props: Properties) {

val document = new Annotation(doc)
pipeline.annotate(document)
val dcoref = document.get(classOf[CorefChainAnnotation])
// Future usage: coreference resolution
// val dcoref = document.get(classOf[CorefChainAnnotation])
val sentences = document.get(classOf[SentencesAnnotation])

val sentenceResults = sentences.zipWithIndex.map { case(sentence, sentIdx) =>
2 changes: 1 addition & 1 deletion examples/nlp_extractor/src/main/scala/Main.scala
@@ -35,7 +35,7 @@ object Main extends App {

// Configuration has been parsed, execute the Document parser
val props = new Properties()
props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse, dcoref")
props.put("annotators", "tokenize, cleanxml, ssplit, pos, lemma, ner, parse")
props.put("parse.maxlen", conf.maxSentenceLength)
props.put("threads", conf.numThreads)
val dp = new DocumentParser(props)
1 change: 1 addition & 0 deletions examples/nlp_extractor_tsv/.gitignore
@@ -0,0 +1 @@
target/
93 changes: 93 additions & 0 deletions examples/nlp_extractor_tsv/README.md
@@ -0,0 +1,93 @@
# NLP Extractor

This directory provides an NLP extractor that is a wrapper around
Stanford CoreNLP. Its input is textual data in TSV format, and its output
is one or more TSV files containing processed sentence data that
can be loaded directly into PostgreSQL / Greenplum tables.

In (1) we show how to use this NLP extractor as an "extractor" in
DeepDive, and in (2) we provide a script to run it as a stand-alone
application in parallel, without DeepDive. The latter is the recommended
way if you have a lot of data, since it achieves more
parallelism.

## Compile

Compile the NLP extractor using the following command in this directory:

sbt stage

## (1) run.sh and Integration with DeepDive

### Input

The input is a TSV file of the following form, where each line is a
document.

The TSV file should have two columns. The first column is
`document_id` (text format), a unique identifier for the document. The
second column is `text`, the full text of the document in plain text:

doc1\tFull text in doc1
doc2\tFull text in doc2
...

Note that the input TSV file should not have headers.

### Output

The output is another TSV file with the following columns:

1. `document_id`: the document_id from the input
2. `sentence`: the raw sentence text, as in the input
3. `words`: (PSQL-formatted) array of words
4. `lemma`: array of lemmatized words
5. `pos_tags`: array of part-of-speech tags
6. `ner_tags`: array of named entity tags
7. `dependencies`: array of collapsed dependencies
8. `sentence_offset`: the index / offset of this sentence within the document
9. `sentence_id`: a unique identifier for the sentence

You can create a table like the following to import the output TSV
file into the database. (Note that this is the `output_relation` in
DeepDive.)

CREATE TABLE sentences(
document_id bigint,
sentence text,
words text[],
lemma text[],
pos_tags text[],
dependencies text[],
ner_tags text[],
sentence_offset bigint,
sentence_id text
);
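
Once the output has been loaded, the PSQL-formatted arrays can be queried directly. A small sketch (the table and column names are the ones defined above; the 'PERSON' tag value is just an illustration):

    -- Sketch: sentences whose NER tags include a PERSON mention.
    SELECT document_id, sentence_id, sentence
    FROM sentences
    WHERE 'PERSON' = ANY(ner_tags)
    LIMIT 10;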

## (2) run_parallel.sh: Stand-alone Parallel NLP Extractor

When `run.sh` is used with DeepDive, the ideal parallelism sometimes
cannot be achieved because of memory constraints. In this case, we
recommend using `run_parallel.sh`, which performs the following steps:

1. Splits your input file `INPUT_FILE` into chunks in `INPUT_FILE.split/`
2. Uses the system tool `xargs` to run `run.sh` on the chunks in parallel.
   The outputs are saved to `INPUT_FILE.split/*.out`.

Run it with the following command:

./run_parallel.sh INPUT_FILE PARALLELISM [INPUT_BATCH_SIZE=100] [SENTENCE_WORDS_LIMIT=120]

- `INPUT_FILE`: your input TSV file
- `PARALLELISM`: a number indicating the desired parallelism, e.g. 8
- `INPUT_BATCH_SIZE`: how many lines go into each file after splitting.
  Default 100.
- `SENTENCE_WORDS_LIMIT`: skip dependency parsing for sentences with more
  words than this limit. This helps speed up the parsing.

When finished, you should manually import the files in
`INPUT_FILE.split/*.out` into your database. You can use a COPY query
like this:

cat INPUT_FILE.split/*.out | psql YOUR_DB_NAME -c "COPY sentences FROM STDIN"
22 changes: 22 additions & 0 deletions examples/nlp_extractor_tsv/build.sbt
@@ -0,0 +1,22 @@
import com.typesafe.sbt.SbtStartScript

name := "deepdive-nlp-parser"

version := "0.1"

scalaVersion := "2.10.3"

resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"

libraryDependencies ++= List(
"ch.qos.logback" % "logback-classic" % "1.0.7",
"com.typesafe.play" %% "play-json" % "2.2.1",
"com.github.scopt" %% "scopt" % "3.2.0",
"edu.stanford.nlp" % "stanford-corenlp" % "3.3.1",
"edu.stanford.nlp" % "stanford-corenlp" % "3.3.1" classifier "models",
"org.scalatest" % "scalatest_2.10" % "2.0.RC2" % "test"
)

parallelExecution in Test := false

seq(SbtStartScript.startScriptForClassesSettings: _*)
1 change: 1 addition & 0 deletions examples/nlp_extractor_tsv/project/plugins.sbt
@@ -0,0 +1 @@
addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0")
5 changes: 5 additions & 0 deletions examples/nlp_extractor_tsv/run.sh
@@ -0,0 +1,5 @@
#! /usr/bin/env bash

# export SBT_OPTS="-Xmx1g"

$(dirname $0)/target/start $@