Skip to content
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ There is a minimal version, which contains only Apache Tika and it's core depend
* Italian
* Spanish.

To install more languages simply update the apt-get command to include the package containing the language you required, or include your own custom packs using an ADD command.
To install more languages, either pass the extra language packages to `docker-tool.sh build`, or run `docker build` manually and set `TESSERACT_LANGUAGES` with [docker --build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg).

To list the official Tesseract language packages, run the following on a Debian-based Linux system (such as Debian or Ubuntu):

apt-cache search --names-only '^tesseract-ocr-[a-z]{3}$'

## Available Tags

Expand Down
23 changes: 15 additions & 8 deletions docker-tool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ while getopts ":h" opt; do
case ${opt} in
h )
echo "Usage:"
echo " docker-tool.sh -h Display this help message."
echo " docker-tool.sh build <TIKA_VERSION> Builds images for <TIKA_VERSION>."
echo " docker-tool.sh test <TIKA_VERSION> Tests images for <TIKA_VERSION>."
echo " docker-tool.sh publish <TIKA_VERSION> Publishes images for <TIKA_VERSION> to Docker Hub."
echo " docker-tool.sh latest <TIKA_VERSION> Tags images for <TIKA_VERSION> as latest on Docker Hub."
echo " docker-tool.sh -h Display this help message."
echo " docker-tool.sh build <TIKA_VERSION> [<TIKA_JAR_NAME>] [<TESSERACT_LANGUAGES>] Builds images for <TIKA_VERSION>, apply [<TIKA_JAR_NAME>], via special [<TESSERACT_LANGUAGES>]."
echo " docker-tool.sh test <TIKA_VERSION> Tests images for <TIKA_VERSION>."
echo " docker-tool.sh publish <TIKA_VERSION> Publishes images for <TIKA_VERSION> to Docker Hub."
echo " docker-tool.sh latest <TIKA_VERSION> Tags images for <TIKA_VERSION> as latest on Docker Hub."
echo ""
echo "Note: [<TESSERACT_LANGUAGES>] is optional for full image,"
echo " to customize various tesseract-ocr packages. Otherwise the default packages are installed."
exit 0
;;
\? )
Expand Down Expand Up @@ -59,19 +62,23 @@ shift $((OPTIND -1))
# Positional arguments, read after the option flags have been shifted away:
#   <subcommand> <TIKA_VERSION> [<TIKA_JAR_NAME>] [<TESSERACT_LANGUAGES>...]
subcommand=$1; shift
version=$1; shift

# The jar name is optional: fall back to the stock server jar when it is
# missing or empty.
jar=${1:-tika-server}
# Only consume the jar argument when one was actually supplied, so that
# omitting it does not turn into a failing `shift`.
if (( $# > 0 )); then
  shift
fi

# Everything left over is a tesseract-ocr package name. "$*" joins the
# remaining arguments into one space-separated string explicitly; assigning a
# bare $@ to a scalar is ambiguous (ShellCheck SC2124).
tesseract_languages="$*"


case "$subcommand" in
build)
build_args="--build-arg TIKA_VERSION=${version} --build-arg TIKA_JAR_NAME=${jar}"
if [[ ! -z "$tesseract_languages" ]]; then
build_args="$build_args --build-arg TESSERACT_LANGUAGES='${tesseract_languages}'"
fi
# Build slim version with minimal dependencies
docker build -t apache/tika:${version} --build-arg TIKA_VERSION=${version} --build-arg TIKA_JAR_NAME=${jar} - < minimal/Dockerfile --no-cache
eval "docker build -t apache/tika:${version} ${build_args} - < minimal/Dockerfile --no-cache"
# Build full version with OCR, Fonts and GDAL
docker build -t apache/tika:${version}-full --build-arg TIKA_VERSION=${version} --build-arg TIKA_JAR_NAME=${jar} - < full/Dockerfile --no-cache
eval "docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile --no-cache"
;;

test)
Expand Down
4 changes: 2 additions & 2 deletions full/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ RUN apt-get update

FROM base as dependencies
ARG JRE='openjdk-14-jre-headless'
ARG TESSERACT_LANGUAGES='tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu'

RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr \
tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu
RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr $TESSERACT_LANGUAGES

RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y xfonts-utils fonts-freefont-ttf fonts-liberation ttf-mscorefonts-installer wget cabextract
Expand Down