Skip to content
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@ There is a minimal version, which contains only Apache Tika and it's core depend
* Italian
* Spanish.

To install more languages simply update the apt-get command to include the package containing the language you required, or include your own custom packs using an ADD command.
To install more languages simply use `docker-build.sh` or manually using [docker --build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg)

For see with version is supported by tesseract on official package:

apt-cache search --names-only '^tesseract-ocr-[a-z]{3}$'

## Available Tags

Expand Down
20 changes: 14 additions & 6 deletions docker-tool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ while getopts ":h" opt; do
case ${opt} in
h )
echo "Usage:"
echo " docker-tool.sh -h Display this help message."
echo " docker-tool.sh build <TIKA_VERSION> Builds images for <TIKA_VERSION>."
echo " docker-tool.sh test <TIKA_VERSION> Tests images for <TIKA_VERSION>."
echo " docker-tool.sh publish <TIKA_VERSION> Publishes images for <TIKA_VERSION> to Docker Hub."
echo " docker-tool.sh latest <TIKA_VERSION> Tags images for <TIKA_VERSION> as latest on Docker Hub."
echo " docker-tool.sh -h Display this help message."
echo " docker-tool.sh build <TIKA_VERSION> [<TESSERACT_LANGUAGES>] Builds images for <TIKA_VERSION> via special [<TESSERACT_LANGUAGES>]."
echo " docker-tool.sh test <TIKA_VERSION> Tests images for <TIKA_VERSION>."
echo " docker-tool.sh publish <TIKA_VERSION> Publishes images for <TIKA_VERSION> to Docker Hub."
echo " docker-tool.sh latest <TIKA_VERSION> Tags images for <TIKA_VERSION> as latest on Docker Hub."
echo ""
echo "Note: [<TESSERACT_LANGUAGES>] is optional for full image,"
echo " for change default tesseract-ocr packages."
exit 0
;;
\? )
Expand Down Expand Up @@ -58,13 +61,18 @@ test_docker_image() {
shift $((OPTIND -1))
subcommand=$1; shift
version=$1; shift
tesseract_languages=$@

case "$subcommand" in
build)
build_args="--build-arg TIKA_VERSION=${version}"
if [[ ! -z "$tesseract_languages" ]]; then
build_args="$build_args --build-arg TESSERACT_LANGUAGES='${tesseract_languages}'"
fi
# Build slim version with minimal dependencies
docker build -t apache/tika:${version} --build-arg TIKA_VERSION=${version} - < minimal/Dockerfile --no-cache
# Build full version with OCR, Fonts and GDAL
docker build -t apache/tika:${version}-full --build-arg TIKA_VERSION=${version} - < full/Dockerfile --no-cache
docker build -t apache/tika:${version}-full ${build_args} - < full/Dockerfile --no-cache
;;

test)
Expand Down
4 changes: 2 additions & 2 deletions full/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ RUN apt-get update

FROM base as dependencies
ARG JRE='openjdk-14-jre-headless'
ARG TESSERACT_LANGUAGES='tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu'

RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr \
tesseract-ocr-eng tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-spa tesseract-ocr-deu
RUN DEBIAN_FRONTEND=noninteractive apt-get -y install $JRE gdal-bin tesseract-ocr $TESSERACT_LANGUAGES

RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y xfonts-utils fonts-freefont-ttf fonts-liberation ttf-mscorefonts-installer wget cabextract
Expand Down