|
1 | | -# ─── Stage 1: heavy stable dependencies (variant-aware) ────────────────────── |
2 | | -# Two image variants are published from this Dockerfile: |
3 | | -# - slim (default, `:latest`) — ~450 MB. cocoindex-code + LiteLLM only. |
4 | | -# For users who'll point the embedding at a cloud provider (OpenAI, |
5 | | -# Voyage, Gemini, …). |
6 | | -# - full (`:full`) — ~5 GB. Also bundles sentence-transformers |
7 | | -# + torch + a pre-baked default model. For users who want offline-ready |
8 | | -# local embeddings without an API key. |
| 1 | +# Single-stage image with cache-friendly layer ordering, so that on upgrade a |
| 2 | +# user's `docker pull` fetches only the small per-release layer. |
9 | 3 | # |
10 | | -# This stage installs only the big, slow-changing deps that are shared across |
11 | | -# releases: |
12 | | -# - full: `sentence-transformers` (pulls torch + transformers + tokenizers |
13 | | -# transitively, ~1 GB of wheels). |
14 | | -# - slim: nothing — cocoindex-code's LiteLLM deps get installed in stage 2. |
| 4 | +# Stable layers (reused across releases — the build cache is keyed on the RUN command |
| 5 | +# string + base image; on a cache hit the digest is unchanged, so users keep them in local cache): |
| 6 | +# 1. apt install gosu + create coco user |
| 7 | +# 2. install uv |
| 8 | +# 3. (full only) `uv pip install sentence-transformers` — ~1 GB of torch + |
| 9 | +# transformers. This is the heavy, slow-changing layer we're optimizing |
| 10 | +# around. |
| 11 | +# 4. (full only) pre-bake the default embedding model under |
| 12 | +# /var/cocoindex/cache/... so the named volume's copy-up populates it |
| 13 | +# on first start without a network fetch. |
| 14 | +# 5. writable-path setup (mkdir /var/cocoindex/db + /var/run/cocoindex_code, |
| 15 | +# chown to coco) + env vars + entrypoint copy. |
15 | 16 | # |
16 | | -# The cache key is the RUN command string, which changes with CCC_VARIANT, so |
17 | | -# BuildKit keeps separate cache entries per variant and reuses each across |
18 | | -# releases until we bump the deps. |
| 17 | +# Per-release layers (invalidate when the source tree changes): |
| 18 | +# 6. Build context bind-mounted at /ccc-src for step 7 — not stored as a layer. |
| 19 | +# 7. `uv pip install "cocoindex>=..." "${CCC_INSTALL_SPEC}"` — installs |
| 20 | +# cocoindex + cocoindex-code + any of their deps not already in place |
| 21 | +# from layer 3. Per-release layer size is bounded by what cocoindex + |
| 22 | +# cocoindex-code + their non-ST deps actually occupy (~tens of MB). |
19 | 23 | # |
20 | | -# `cocoindex` and `cocoindex-code` are deliberately NOT installed here — |
21 | | -# they bump often, so pinning them at this layer would invalidate the heavy |
22 | | -# cache on every release. Stage 2 installs them on top; transitive deps are |
23 | | -# already satisfied, so uv only fetches the two packages themselves. |
| 24 | +# Two image variants are published per release: |
| 25 | +# - slim (default, `:latest`) — ~450 MB. Layer 3 is a no-op; cocoindex-code's |
| 26 | +# LiteLLM deps install in layer 7. |
| 27 | +# - full (`:full`) — ~5 GB. Layer 3 + Layer 4 bundle torch + |
| 28 | +# sentence-transformers + a baked model for offline-ready local embeddings. |
24 | 29 | # |
25 | 30 | # Use slim (glibc-based) — cocoindex ships pre-built Rust wheels that need glibc. |
26 | 31 | # Alpine / musl-libc would require building from source. |
27 | 32 | # |
28 | 33 | # `--system` tells uv to install into the base Python at |
29 | 34 | # /usr/local/lib/python3.12/... since there's no virtualenv in the image. |
30 | | -FROM python:3.12-slim AS deps |
| 35 | + |
| 36 | +FROM python:3.12-slim |
| 37 | + |
| 38 | +RUN apt-get update \ |
| 39 | + && apt-get install -y --no-install-recommends gosu \ |
| 40 | + && rm -rf /var/lib/apt/lists/* \ |
| 41 | + && groupadd -g 1000 coco \ |
| 42 | + && useradd -u 1000 -g 1000 -m coco |
31 | 43 |
|
32 | 44 | RUN pip install --quiet uv |
33 | 45 |
|
| 46 | +# Heavy, stable deps for full variant. The RUN command string is constant, so |
| 47 | +# while the build cache is reused the layer digest stays the same across |
| 48 | +# releases and users skip re-downloading this layer on upgrade. |
34 | 49 | ARG CCC_VARIANT=slim |
35 | 50 | RUN if [ "$CCC_VARIANT" = "full" ]; then \ |
36 | 51 | uv pip install --system --prerelease=allow sentence-transformers; \ |
37 | 52 | fi |
38 | 53 |
|
39 | | -# ─── Stage 2: install cocoindex + cocoindex-code (per release) ─────────────── |
40 | | -# Cheap relative to stage 1: transitive deps like torch are already in place |
41 | | -# for the full variant; for slim there are no heavy deps to pull. uv only |
42 | | -# needs to fetch the cocoindex + cocoindex-code wheels themselves. |
43 | | -FROM deps AS builder |
44 | | -WORKDIR /build |
45 | | -ARG CCC_VARIANT=slim |
46 | | - |
47 | | -# Default behaviour: install cocoindex-code from PyPI, picking the extras |
48 | | -# that match CCC_VARIANT. |
49 | | -# Release workflow / local tests override with (respectively): |
50 | | -# --build-arg CCC_INSTALL_SPEC=/ccc-src |
51 | | -# --build-arg CCC_INSTALL_SPEC=/ccc-src[full] |
52 | | -ARG CCC_INSTALL_SPEC="" |
53 | | -COPY . /ccc-src |
54 | | -RUN if [ -z "$CCC_INSTALL_SPEC" ]; then \ |
55 | | - if [ "$CCC_VARIANT" = "full" ]; then \ |
56 | | - CCC_INSTALL_SPEC="cocoindex-code[full]"; \ |
57 | | - else \ |
58 | | - CCC_INSTALL_SPEC="cocoindex-code"; \ |
59 | | - fi; \ |
60 | | - fi; \ |
61 | | - uv pip install --system --prerelease=allow \ |
62 | | - "cocoindex>=1.0.0a33" \ |
63 | | - "${CCC_INSTALL_SPEC}" |
64 | | - |
65 | | -# ─── Stage 3: pre-bake the default embedding model (full only) ─────────────── |
66 | | -# For the full variant, bakes Snowflake/snowflake-arctic-embed-xs into |
67 | | -# /var/cocoindex/cache/... so Docker's first-mount copy-up populates the |
68 | | -# cocoindex-data volume with the model — no network fetch on first start. |
69 | | -# For slim, just creates empty cache dirs so the runtime stage's COPY works |
70 | | -# regardless of variant. |
71 | | -FROM builder AS model_cache |
72 | | -ARG CCC_VARIANT=slim |
73 | | - |
74 | 54 | ENV HF_HOME=/var/cocoindex/cache/huggingface \ |
75 | 55 | SENTENCE_TRANSFORMERS_HOME=/var/cocoindex/cache/sentence-transformers |
76 | 56 |
|
| 57 | +# Pre-bake the default embedding model (full only). For slim, just create |
| 58 | +# empty cache dirs so the cocoindex-data named volume mounts cleanly. |
77 | 59 | RUN mkdir -p /var/cocoindex/cache/huggingface /var/cocoindex/cache/sentence-transformers \ |
78 | 60 | && if [ "$CCC_VARIANT" = "full" ]; then \ |
79 | 61 | python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('Snowflake/snowflake-arctic-embed-xs'); print('Model cached.')"; \ |
80 | 62 | fi |
81 | 63 |
|
82 | | -# ─── Stage 4: runtime ───────────────────────────────────────────────────────── |
83 | | -FROM python:3.12-slim AS runtime |
84 | | - |
85 | | -# gosu for privilege-drop (PUID/PGID pattern); create non-root coco user. |
86 | | -RUN apt-get update \ |
87 | | - && apt-get install -y --no-install-recommends gosu \ |
88 | | - && rm -rf /var/lib/apt/lists/* \ |
89 | | - && groupadd -g 1000 coco \ |
90 | | - && useradd -u 1000 -g 1000 -m coco |
91 | | - |
92 | | -# Copy installed packages + pre-baked model from previous stages. |
93 | | -COPY --from=model_cache /usr/local/lib/python3.12 /usr/local/lib/python3.12 |
94 | | -COPY --from=model_cache /usr/local/bin/cocoindex-code /usr/local/bin/cocoindex-code |
95 | | -COPY --from=model_cache /usr/local/bin/ccc /usr/local/bin/ccc |
96 | | -COPY --from=model_cache /var/cocoindex/cache /var/cocoindex/cache |
97 | | - |
98 | | -# Pre-create writable paths so the entrypoint's chown (under PUID) works even on |
99 | | -# a fresh container, and so the default root-uid path has them in place. |
| 64 | +# Writable paths the daemon needs, pre-chowned to coco. Under PUID/PGID the |
| 65 | +# entrypoint re-chowns to the host user; under root (Docker Desktop |
| 66 | +# default) coco-ownership is harmless since processes run as root and can |
| 67 | +# write anywhere. |
100 | 68 | RUN mkdir -p /var/cocoindex/db /var/run/cocoindex_code \ |
101 | 69 | && chown -R coco:coco /var/cocoindex /var/run/cocoindex_code |
102 | 70 |
|
103 | 71 | WORKDIR /workspace |
104 | 72 |
|
105 | | -# ── Runtime defaults (all overridable via -e / --env) ───────────────────────── |
106 | | -# |
107 | | -# COCOINDEX_CODE_DIR — holds global_settings.yml on the bind mount so users can |
108 | | -# edit it directly on the host. |
109 | | -# COCOINDEX_CODE_RUNTIME_DIR — keeps daemon.sock/pid/log on the container's |
110 | | -# native filesystem (AF_UNIX sockets on bind mounts are unreliable on |
111 | | -# Docker Desktop, and /var/run is the standard spot for ephemeral runtime |
112 | | -# state — wiped on container recreate, no stale-socket risk). |
113 | | -# COCOINDEX_CODE_DB_PATH_MAPPING — keeps the indexer's LMDB + SQLite databases |
114 | | -# on the native filesystem for speed and correctness. |
115 | | -# HF_HOME / SENTENCE_TRANSFORMERS_HOME — direct the model cache at the path |
116 | | -# the cocoindex-data volume mounts over. |
| 73 | +# Runtime defaults — see the spec for what each does. All overridable at |
| 74 | +# `docker run -e ...` time. |
117 | 75 | ENV COCOINDEX_CODE_DIR=/workspace/.cocoindex_code \ |
118 | 76 | COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code \ |
119 | 77 | COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db \ |
120 | | - COCOINDEX_CODE_DAEMON_SUPERVISED=1 \ |
121 | | - HF_HOME=/var/cocoindex/cache/huggingface \ |
122 | | - SENTENCE_TRANSFORMERS_HOME=/var/cocoindex/cache/sentence-transformers |
123 | | - |
124 | | -# Set COCOINDEX_CODE_HOST_PATH_MAPPING at run time — it depends on the host path |
125 | | -# the user bind-mounts to /workspace and can't be baked into the image. |
| 78 | + COCOINDEX_CODE_DAEMON_SUPERVISED=1 |
126 | 79 |
|
127 | 80 | COPY docker/entrypoint.sh /entrypoint.sh |
128 | 81 | RUN chmod +x /entrypoint.sh |
129 | 82 | ENTRYPOINT ["/entrypoint.sh"] |
| 83 | + |
| 84 | +# ─── Per-release layer (last so only this one invalidates per release) ───── |
| 85 | +# |
| 86 | +# Default (PyPI flow): install cocoindex-code from PyPI, picking the extras |
| 87 | +# that match CCC_VARIANT. |
| 88 | +# Release workflow / local tests override with (respectively): |
| 89 | +# --build-arg CCC_INSTALL_SPEC=/ccc-src |
| 90 | +# --build-arg CCC_INSTALL_SPEC=/ccc-src[full] |
| 91 | +# to install from the source tree. `rw=true` on the bind mount gives |
| 92 | +# hatch-vcs a writable overlay for `_version.py` during the PEP 517 build; |
| 93 | +# the overlay is discarded after the RUN, so the source tree doesn't |
| 94 | +# persist as a layer in the final image. |
| 95 | +# `--no-cache` makes uv use a temporary cache directory for this install, so |
| 96 | +# downloaded/built artifacts are not left behind in the per-release layer. |
| 97 | +ARG CCC_INSTALL_SPEC="" |
| 98 | +RUN --mount=type=bind,source=.,target=/ccc-src,rw=true \ |
| 99 | + if [ -z "$CCC_INSTALL_SPEC" ]; then \ |
| 100 | + if [ "$CCC_VARIANT" = "full" ]; then \ |
| 101 | + CCC_INSTALL_SPEC="cocoindex-code[full]"; \ |
| 102 | + else \ |
| 103 | + CCC_INSTALL_SPEC="cocoindex-code"; \ |
| 104 | + fi; \ |
| 105 | + fi; \ |
| 106 | + uv pip install --system --prerelease=allow --no-cache \ |
| 107 | + "cocoindex>=1.0.0a33" \ |
| 108 | + "${CCC_INSTALL_SPEC}"
0 commit comments