This repository has been archived by the owner on Mar 30, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathDockerfile
152 lines (110 loc) · 4.25 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
FROM python:3.7.10-buster AS base
# Common foundation for every later stage (builder and runtime images alike):
# OS packages, the UNO environment and the virtual-environment location.

# Package notes:
#   - libreoffice-writer is installed to convert Word documents to PDF
#   - python3-uno supplies the `uno` / `unohelper` modules verified below
#   - the font packages and fontconfig provide common fonts, or configuration
#     mapping to their alternatives
# The apt package lists are removed in the same layer to keep it small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        dumb-init \
        fontconfig \
        fonts-crosextra-caladea \
        fonts-crosextra-carlito \
        fonts-liberation \
        fonts-liberation2 \
        libgl1 \
        libreoffice-writer \
        poppler-utils \
        python3-uno \
    && rm -rf /var/lib/apt/lists/*

# UNO settings:
#   UNO_PATH                directory added to PYTHONPATH to find the uno modules
#   UNO_PYTHON_PATH         interpreter used to import them
#   UNO_OFFICE_BINARY_PATH  LibreOffice binary
ENV UNO_PATH=/usr/lib/python3/dist-packages \
    UNO_PYTHON_PATH=python3.7 \
    UNO_OFFICE_BINARY_PATH=/usr/lib/libreoffice/program/soffice.bin

# Fail the build early if any of the UNO settings above does not resolve:
# the directory must be listable, the modules importable and the binary present.
RUN echo "UNO_PATH: ${UNO_PATH}" \
    && ls -l "${UNO_PATH}" \
    && echo "UNO_PYTHON_PATH: ${UNO_PYTHON_PATH}" \
    && PYTHONPATH="${UNO_PATH}" "${UNO_PYTHON_PATH}" -c 'import uno, unohelper' \
    && echo "UNO_OFFICE_BINARY_PATH: ${UNO_OFFICE_BINARY_PATH}" \
    && ls -l "${UNO_OFFICE_BINARY_PATH}"

WORKDIR /opt/sciencebeam_parser

# VENV is declared in its own instruction because the next ENV expands it;
# variables set within one ENV instruction are not visible to each other.
ENV VENV=/opt/venv
# Put the virtual environment's bin directory first on PATH.
ENV VIRTUAL_ENV=${VENV} \
    PYTHONUSERBASE=${VENV} \
    PATH=${VENV}/bin:${PATH}
# builder-base
FROM base AS builder-base
# Build-time foundation shared by the `builder` and `builder-cv` stages:
# adds a compiler and Tesseract/Leptonica packages on top of `base`, then
# creates the virtual environment and installs the build requirements.

# NOTE(review): gcc and the -dev packages are presumably needed to compile
# Python packages with native extensions (e.g. tesserocr) - confirm.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        gcc \
        libleptonica-dev \
        libtesseract-dev \
        libtesseract4 \
        tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.build.txt ./

# Create the virtual environment at ${VENV}. Because `base` puts ${VENV}/bin
# first on PATH, the `pip` that follows is expected to be the venv's pip.
# --no-cache-dir keeps pip's download/wheel cache out of the image layer,
# which would otherwise be inherited by every stage built on this one.
RUN python3 -m venv "${VENV}" \
    && pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.build.txt
FROM builder-base AS builder
# Populates the virtual environment for the default (non-CV) runtime image.
# Each requirements file is copied and installed in its own layer so that a
# change to one file only invalidates the layers from that point onwards.
# --no-cache-dir keeps pip's download/wheel cache out of the layers.

COPY requirements.cpu.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.cpu.txt

COPY requirements.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.txt

# Installed with --no-deps: only the packages listed in the file itself are
# installed, without letting pip resolve their dependencies.
COPY requirements.delft.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.delft.txt --no-deps
# builder-cv
FROM builder-base AS builder-cv
# Populates the virtual environment for the CV/OCR variant: the same
# requirement sets as the default builder plus requirements.cv.txt and
# requirements.ocr.txt. Each requirements file is copied and installed in its
# own layer so that a change to one file only invalidates the layers from
# that point onwards.
# --no-cache-dir keeps pip's download/wheel cache out of the layers; this
# matters here because the `dev` image is built directly on this stage.

COPY requirements.cpu.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.cpu.txt

COPY requirements.cv.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.cv.txt

COPY requirements.ocr.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.ocr.txt

COPY requirements.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.txt

# Installed with --no-deps: only the packages listed in the file itself are
# installed, without letting pip resolve their dependencies.
COPY requirements.delft.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.delft.txt --no-deps
# dev image
FROM builder-cv AS dev
# Development image: the CV builder environment plus the development
# requirements, the package sources, tests, test data, dev scripts, docs and
# the lint/packaging configuration files.

# Dependencies are installed before the sources are copied so that source
# edits do not invalidate the dependency layer.
# --no-cache-dir keeps pip's download/wheel cache out of this final image.
COPY requirements.dev.txt ./
RUN pip install --no-cache-dir --disable-pip-version-check --no-warn-script-location \
        -r requirements.dev.txt

COPY sciencebeam_parser ./sciencebeam_parser
COPY tests ./tests
COPY test-data ./test-data
COPY scripts/dev ./scripts/dev
COPY doc ./doc
COPY .flake8 .pylintrc setup.py MANIFEST.in README.md ./

# temporary workaround for tesserocr https://github.com/sirfz/tesserocr/issues/165
ENV LC_ALL=C
# runtime image
FROM base AS runtime
# Default runtime image: the `base` stage plus the virtual environment built
# in the `builder` stage and the application package. No compiler or build
# requirements are carried over.

COPY --from=builder /opt/venv /opt/venv

COPY sciencebeam_parser ./sciencebeam_parser
COPY docker/entrypoint.sh ./docker/entrypoint.sh

# NOTE(review): presumably read by the application's DeLFT integration as
# sequence-length / window-stride defaults - confirm against the package.
ENV SCIENCEBEAM_DELFT_MAX_SEQUENCE_LENGTH=2000 \
    SCIENCEBEAM_DELFT_INPUT_WINDOW_STRIDE=1800

# Document the default service port (see CMD below). EXPOSE is metadata only;
# the port still has to be published at `docker run` time.
EXPOSE 8070

# NOTE(review): no USER is set, so the container runs as root. Switching to a
# non-root user needs a check of which paths the entrypoint writes to.

# dumb-init runs as PID 1 and launches the entrypoint script; CMD supplies the
# default arguments, which can be overridden at `docker run` time.
ENTRYPOINT ["/usr/bin/dumb-init", "--", "/opt/sciencebeam_parser/docker/entrypoint.sh"]
CMD [ "--port=8070", "--host=0.0.0.0" ]
# runtime-cv image
FROM base AS runtime-cv
# CV/OCR runtime image: the `base` stage plus the Tesseract runtime packages,
# the virtual environment built in the `builder-cv` stage and the application
# package. The compiler and -dev packages from the builder are not carried over.

# Runtime-only Tesseract packages (library and English language data).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libtesseract4 \
        tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*

COPY --from=builder-cv /opt/venv /opt/venv

COPY sciencebeam_parser ./sciencebeam_parser
COPY docker/entrypoint.sh ./docker/entrypoint.sh

# NOTE(review): presumably read by the application's DeLFT integration as
# sequence-length / window-stride defaults - confirm against the package.
ENV SCIENCEBEAM_DELFT_MAX_SEQUENCE_LENGTH=2000 \
    SCIENCEBEAM_DELFT_INPUT_WINDOW_STRIDE=1800

# temporary workaround for tesserocr https://github.com/sirfz/tesserocr/issues/165
ENV LC_ALL=C

# Document the default service port (see CMD below). EXPOSE is metadata only;
# the port still has to be published at `docker run` time.
EXPOSE 8070

# NOTE(review): no USER is set, so the container runs as root. Switching to a
# non-root user needs a check of which paths the entrypoint writes to.

# dumb-init runs as PID 1 and launches the entrypoint script; CMD supplies the
# default arguments, which can be overridden at `docker run` time.
ENTRYPOINT ["/usr/bin/dumb-init", "--", "/opt/sciencebeam_parser/docker/entrypoint.sh"]
CMD [ "--port=8070", "--host=0.0.0.0" ]