Skip to content

Commit

Permalink
Merge pull request #23 from crim-ca/fix-nlp-docker
Browse files Browse the repository at this point in the history
  • Loading branch information
fmigneault authored Dec 13, 2023
2 parents aae8dd4 + d225ba2 commit 0cc997c
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@
# disallow notebooks at root
./*.ipynb
**/.ipynb_checkpoints/

### Outputs
*.log
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Included custom configs change the value of MAKEFILE_LIST (every included
# file is appended to it). Record the path of this Makefile before any include
# so it can still be referenced afterwards (e.g. by a help target).
MAKEFILE_NAME := $(lastword $(MAKEFILE_LIST))
# Include custom config if it is available (the leading '-' ignores a missing file)
-include Makefile.config

# Application
# APP_ROOT: absolute path of the directory that contains this Makefile.
APP_ROOT := $(abspath $(dir $(MAKEFILE_NAME)))
# APP_NAME: last path component of APP_ROOT. The built-in notdir is used
# instead of `$(shell basename ...)` to avoid forking a shell at parse time.
APP_NAME := $(notdir $(APP_ROOT))
# User-tunable knobs: `?=` keeps any value already provided by the environment,
# the command line or Makefile.config.
# APP_DOMAINS: space-separated list of domains, one docker-build-<domain> target each.
APP_DOMAINS ?= eo nlp
# DOCKER_REPO: repository prefix used when tagging the built images.
DOCKER_REPO ?= crim-ca/pavics-jupyter-images

# One docker-build-<domain> target per entry of APP_DOMAINS.
DOCKER_BUILDS := $(addprefix docker-build-,$(APP_DOMAINS))

# The recipe pipes `docker build` into `tee`; without pipefail the pipeline
# status is tee's, so a failed build would be reported as success. pipefail is
# not available in POSIX sh, hence bash is selected for these targets only.
$(DOCKER_BUILDS): SHELL := bash

# Build the image for one domain ($* is the domain name matched by %).
# The build context is the domain directory under APP_ROOT, the image is tagged
# $(DOCKER_REPO)/<domain>:latest, and the combined stdout/stderr is both shown
# and saved to $(APP_ROOT)/make-docker-build-<domain>.log.
$(DOCKER_BUILDS): docker-build-%:
	set -o pipefail && \
	docker build -t $(DOCKER_REPO)/$*:latest "$(APP_ROOT)/$*" 2>&1 | tee "$(APP_ROOT)/make-$@.log"

# None of these targets produce a file named after themselves; declare them all
# phony so a stray file called e.g. "docker-build-nlp" cannot mask the target.
.PHONY: docker-build $(DOCKER_BUILDS)
# Build the images of every domain listed in APP_DOMAINS.
docker-build: $(DOCKER_BUILDS)
41 changes: 26 additions & 15 deletions nlp/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@ FROM birdhouse/pavics-jupyter-base:0.5.0

# must update conda env as root, because of a permission error when having pip dependencies in the input yml file
USER root
# install prerequisites for compiling some dependencies
RUN apt install -y pkg-config libpcre++-dev

COPY environment.yml /environment.yml
COPY notebook_config.yml /notebook_config.yml

# update env "birdy"
# use umask 0000 so that package files for the updated environment are usable by the user for the jupyter-conda-extension
RUN umask 0000 && mamba env update -f /environment.yml \
&& mamba clean -a
RUN umask 0000 && \
mamba env update -f /environment.yml && \
mamba clean -y -a

# Set the encoding to UTF-8, this is needed for heideltime to work properly
ENV LANG=C.UTF-8
Expand All @@ -19,40 +21,49 @@ RUN python -m spacy download en_core_web_trf

# Downloading the ner-large flair model
RUN mkdir flair_models && \
curl -L -o flair_models/ner-large https://huggingface.co/flair/ner-english-large/resolve/main/pytorch_model.bin > /dev/null
curl -L -o flair_models/ner-large \
https://huggingface.co/flair/ner-english-large/resolve/main/pytorch_model.bin > /dev/null

# Heideltime Tree-tagger Installation
RUN mkdir -p heideltime/tree-tagger-linux-3.2.3 && cd heideltime/tree-tagger-linux-3.2.3 && \
curl -o tree-tagger-linux-3.2.3.tar.gz https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.3.tar.gz && \
curl -o tagger-scripts.tar.gz https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
curl -o english.par.gz https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/english.par.gz && \
curl -o install-tagger.sh https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/install-tagger.sh && \
curl -o tree-tagger-linux-3.2.3.tar.gz \
https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/tree-tagger-linux-3.2.3.tar.gz && \
curl -o tagger-scripts.tar.gz \
https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
curl -o english.par.gz \
https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/english.par.gz && \
curl -o install-tagger.sh \
https://www.cis.lmu.de/~schmid/tools/TreeTagger/data/install-tagger.sh && \
tar -xvzf tree-tagger-linux-3.2.3.tar.gz && \
sh install-tagger.sh && \
rm tree-tagger-linux-3.2.3.tar.gz tagger-scripts.tar.gz english.par.gz install-tagger.sh

# Download and extract heideltime.standalone.jar
RUN cd heideltime && \
curl -L -o heideltime-standalone-2.2.1.tar.gz https://github.com/HeidelTime/heideltime/releases/download/VERSION2.2.1/heideltime-standalone-2.2.1.tar.gz && \
RUN cd heideltime && \
curl -L -o heideltime-standalone-2.2.1.tar.gz \
https://github.com/HeidelTime/heideltime/releases/download/VERSION2.2.1/heideltime-standalone-2.2.1.tar.gz && \
tar -xzvf heideltime-standalone-2.2.1.tar.gz heideltime-standalone/de.unihd.dbs.heideltime.standalone.jar && \
mv heideltime-standalone/de.unihd.dbs.heideltime.standalone.jar . && \
rmdir heideltime-standalone && \
rm heideltime-standalone-2.2.1.tar.gz

# Give read&write permission to jenkins for config
RUN chown -R jenkins heideltime

# Setup Haskell for Duckling server
# https://github.com/facebook/duckling
RUN curl -sSL https://get.haskellstack.org/ | sh && \
RUN curl -sSL https://get.haskellstack.org/ | bash && \
git clone https://github.com/facebook/duckling && \
cd duckling && \
stack build && \
stack install && \
cd .. && \
rm -fr duckling

# Give read&write permission to jenkins for config
RUN chown -R jenkins heideltime
ENV PATH="/root/.local/bin:$PATH"

# Give ownership of the conda cache folder to jenkins, to enable installing packages by the user from JupyterLab
RUN mkdir /opt/conda/pkgs/cache && chown -R 1000:1000 /opt/conda/pkgs/cache

COPY notebook_config.yml /notebook_config.yml

# specify user because of problem running start-notebook.sh when being root
USER jenkins

0 comments on commit 0cc997c

Please sign in to comment.