From 4ebd73f067cf52f46b7774238107dd95d0d14224 Mon Sep 17 00:00:00 2001 From: Eamon Date: Sun, 31 May 2026 19:26:54 +0530 Subject: [PATCH] exp(#58) * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * feat(ci): optimize workflow pipeline and update docker configurations * refactor(ci): optimize workflow pipeline and update docker configurations * refactor : optimize workflow pipeline and update docker configurations * refactor : optimize workflow pipeline and update docker configurations * refactor : optimize workflow pipeline and update docker configurations * Added MIT LICENSE to this project Quadtrix.cpp * Refactor Dockerfile to use ARG for CUDA version * Refactor Dockerfile for backend dependencies * refactor : Dockerfile.backend optimize workflow pipeline * refactor : Dockerfile.backend optimize workflow pipeline * refactor : Dockerfile.backend optimize workflow pipeline * refactor : Dockerfile.backend optimize workflow pipeline * Delete .devops/Dockerfile.frontend * Delete .devops/Dockerfile.dev.frontend * refactor : Dockerfile.backend optimize workflow pipeline * refactor : Dockerfile.backend optimize workflow pipeline * refactored (CI): consolidated manual Docker build jobs into a matrix strategy to reduce duplication * refactored (CI): consolidated manual Docker build jobs into a matrix strategy to reduce duplication * refactor(ui): rewrite ThinkingIndicator to use inline styles and CSS keyframes * refactor : message bubble layout to use inline styles * refactor(ui): complete inline-style migration and update 
auto-scroll implementation * refactor(ui): complete inline-style migration for MessageAvatar component * refactor(ui): rewrite EmptyState component using pure inline styles * refactored(tensor): vectorize element-wise addition and scalar scaling using AVX/SSE - Added SIMD vectorization support (`__AVX__` and `__SSE__`) for element-wise `add`, `add_inplace`, and `scale` operations. - Maintained scalar fallback paths for non-vectorized bounds and platforms lacking hardware extensions. - Explicitly defined rule-of-five constructors (`default` and `noexcept` moves) within the `Tensor` struct layout. - Optimized vector initialization across the core construct layer via `std::move` and `std::vector::reserve`. * refactor(main): redesign training loop to log per-step and sample during evaluation - Replaced the periodic block evaluation layout with standard, per-step logging metrics (`loss`, `ms`, and `tok/s`). - Shifted initial validation loss calculation out of the iteration cycle to establish a zero-state baseline. - Restructured token streaming so that generations are triggered conditionally inside the training loop post-evaluation windows. - Streamlined architecture parameter reporting and consolidated command-line configuration visual prints. * feat: implement GPT training loop with multi-GPU and memory optimizations - Add advanced memory footprint optimization using forward-activation recomputation for LayerNorm and GeLU. - Optimize layer-wise activation buffer layout using a centralized `TensorSpec` registry to support large batch scaling. - Integrate cuBLASLt matmul fusions, optional cuDNN attention layers, and stochastic rounding options. - Fall back gracefully to `cudaMallocManaged` under heavy loads to prevent Outlier/OOM crashes. 
* Update README.md with new banner for quadtrix.cpp --------- Co-authored-by: Max --- Dockerfile => .devops/Dockerfile | 2 +- Dockerfile.cuda => .devops/Dockerfile.backend | 0 .devops/Dockerfile.cpp | 65 + .devops/nginx.conf | 47 + .dockerignore | 57 +- .github/workflows/ci.yml | 238 +- .github/workflows/docker-publish.yml | 163 +- .github/workflows/pr-check.yml | 238 ++ CUDA/main.cu | 2070 +++++++++++++++++ LICENSE | 2 +- Makefile | 104 + README.md | 4 + config/config.h | 20 +- docker-compose.dev.yml | 45 + docker-compose.gpu.yml | 32 + docker-compose.yml | 181 +- frontend/src/components/chat/EmptyState.tsx | 96 +- .../src/components/chat/MessageAvatar.tsx | 45 +- frontend/src/components/chat/MessageList.tsx | 21 +- frontend/src/components/chat/MessageRow.tsx | 87 +- .../src/components/chat/ThinkingIndicator.tsx | 28 +- include/tensor.h | 749 ++++-- main.cpp | 193 +- run.md | 492 ---- scripts/build.sh | 161 ++ 25 files changed, 4077 insertions(+), 1063 deletions(-) rename Dockerfile => .devops/Dockerfile (94%) rename Dockerfile.cuda => .devops/Dockerfile.backend (100%) create mode 100644 .devops/Dockerfile.cpp create mode 100644 .devops/nginx.conf create mode 100644 .github/workflows/pr-check.yml create mode 100644 CUDA/main.cu create mode 100644 Makefile create mode 100644 docker-compose.dev.yml create mode 100644 docker-compose.gpu.yml delete mode 100644 run.md create mode 100644 scripts/build.sh diff --git a/Dockerfile b/.devops/Dockerfile similarity index 94% rename from Dockerfile rename to .devops/Dockerfile index 65fcca9..c7c0061 100644 --- a/Dockerfile +++ b/.devops/Dockerfile @@ -35,4 +35,4 @@ COPY . . 
ENV PATH="/app/venv/bin:$PATH" ENV PYTHONUNBUFFERED=1 -ENTRYPOINT ["python3", "engine/main.py"] \ No newline at end of file +ENTRYPOINT ["python3", "engine/main.py"] diff --git a/Dockerfile.cuda b/.devops/Dockerfile.backend similarity index 100% rename from Dockerfile.cuda rename to .devops/Dockerfile.backend diff --git a/.devops/Dockerfile.cpp b/.devops/Dockerfile.cpp new file mode 100644 index 0000000..0a1ce15 --- /dev/null +++ b/.devops/Dockerfile.cpp @@ -0,0 +1,65 @@ + +FROM ubuntu:24.04 AS builder + +LABEL stage=builder + +ARG DEBIAN_FRONTEND=noninteractive +ARG BUILD_TYPE=Release +ARG CMAKE_EXTRA_FLAGS="" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + g++ \ + cmake \ + ninja-build \ + ccache \ + git \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /src + +COPY main.cpp ./ +COPY benchmark.cpp ./ +COPY config/ ./config/ +COPY include/ ./include/ +COPY data/ ./data/ + +# If model/Cmakelists.txt exists, use cmake; else fall back to direct g++ +RUN set -e; \ + if [ -f model/Cmakelists.txt ] || [ -f CMakeLists.txt ]; then \ + cmake -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${CMAKE_EXTRA_FLAGS} .; \ + cmake --build build --parallel "$(nproc)"; \ + else \ + g++ -std=c++17 -O3 -march=native \ + -I. 
-Iinclude \ + -o /usr/local/bin/quadtrix \ + main.cpp; \ + fi +FROM ubuntu:24.04 AS runtime + +LABEL org.opencontainers.image.title="Quadtrix.cpp Engine" +LABEL org.opencontainers.image.description="C++ transformer engine for local LM inference" +LABEL org.opencontainers.image.source="https://github.com/Eamon2009/Quadtrix.cpp" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libstdc++6 \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY --from=builder /usr/local/bin/quadtrix /usr/local/bin/quadtrix +COPY --from=builder /src/data/ ./data/ +VOLUME ["/models"] + +ENV GPT_DATA_PATH=/app/data/input.txt \ + GPT_MODEL_PATH=/models/best_model.bin + +EXPOSE 8080 + +ENTRYPOINT ["/usr/local/bin/quadtrix"] +CMD ["data/input.txt", "--chat"] diff --git a/.devops/nginx.conf b/.devops/nginx.conf new file mode 100644 index 0000000..5804e6e --- /dev/null +++ b/.devops/nginx.conf @@ -0,0 +1,47 @@ +# Quadtrix.cpp — Nginx config +# Serves the Vite SPA and proxies /api/* to the FastAPI backend + +server { + listen 80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Gzip + gzip on; + gzip_types text/plain text/css application/json application/javascript + text/xml application/xml application/xml+rss text/javascript + application/wasm; + gzip_min_length 1024; + + # SPA fallback — all unknown routes return index.html + location / { + try_files $uri $uri/ /index.html; + } + + # Proxy API calls to FastAPI backend + location /api/ { + proxy_pass http://backend:3001; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_read_timeout 120s; + proxy_send_timeout 120s; + } + + # Static asset cache + location ~* \.(js|css|png|svg|ico|woff2|woff|ttf|webmanifest)$ { + expires 1y; + 
add_header Cache-Control "public, immutable"; + } + + # Service worker must not be cached + location = /sw.js { + add_header Cache-Control "no-cache"; + } +} diff --git a/.dockerignore b/.dockerignore index f001789..603874e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,35 +1,44 @@ + .git .gitignore .github .venv -**/__pycache__ -**/*.pyc -**/*.pyo -**/*.pyd -engine/logs/ +__pycache__ +*.pyc +*.pyo +*.pyd +*.egg-info +.pytest_cache +.ruff_cache +dist/ +build/ +*.egg node_modules frontend/node_modules -.npm-cache -frontend/.vite frontend/dist - -# Model weights -*.pt -*.bin -models/ - -# Windows build artifacts -*.exe +frontend/.vite +*.npm-cache +.npmignore +*.o +*.a +*.so +*.dylib quadtrix.exe -*.png -*.jpg -*.jpeg -*.md -LICENSE -contributing.md -SECURITY.md -run.md +quadtrix +build/ +cmake-build-*/ +.vscode +*.bin +*.pt +*.gguf +*.safetensors +engine/best_model.pt +engine/logs/ +engine/fineweb_30mb.txt +data/input.txt .DS_Store Thumbs.db +*.swp +*.swo .idea -.vscode \ No newline at end of file +docker-compose.override.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 311ad33..bf49286 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,74 +2,216 @@ name: CI on: push: - branches: - - exp - - master - pull_request: - -permissions: - contents: read + branches: [master, dev] + workflow_dispatch: + inputs: + image: + description: "Which image to build?" + required: true + type: choice + options: + - cpp + - cpu + - cuda + - all + push: + description: "Push to ghcr.io?" 
+ required: true + default: "true" + type: choice + options: ["true", "false"] + +env: + REGISTRY: ghcr.io + IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix jobs: - cpp-build: - name: C++ build + + file-integrity: + name: File integrity + if: github.event_name == 'push' runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Check required files exist + run: | + files=( + "main.cpp" + "engine/main.py" + "requirements.txt" + ) + failed=0 + for f in "${files[@]}"; do + if [ -f "$f" ]; then + echo "✅ $f" + else + echo "❌ $f — MISSING" + failed=1 + fi + done + exit $failed + + + lint-python: + name: Python lint + if: github.event_name == 'push' + runs-on: ubuntu-latest steps: - - name: Check out repository - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - - name: Install compiler - run: sudo apt-get update && sudo apt-get install -y g++ + - name: Lint engine/ (ruff) + uses: chartboost/ruff-action@v1 + with: + args: "check engine/ --ignore E501 --exit-zero" - - name: Build Quadtrix - run: g++ -std=c++17 -O2 -I. -Iinclude -o quadtrix main.cpp - backend-smoke: - name: Backend smoke checks + build-cpp: + name: C++ compile check + if: github.event_name == 'push' runs-on: ubuntu-latest - steps: - - name: Check out repository - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.11" + - name: Install g++ + run: sudo apt-get update && sudo apt-get install -y g++ - - name: Install backend runtime dependencies + - name: Compile main.cpp run: | - python -m pip install --upgrade pip - pip install fastapi "uvicorn[standard]" pydantic pydantic-settings httpx redis + g++ -std=c++17 -O3 \ + -I. 
-Iinclude \ + -o quadtrix main.cpp - - name: Compile Python sources - run: python -m compileall backend engine + - name: Smoke test + run: ./quadtrix --help || true - - name: Import FastAPI application - working-directory: backend - run: | - python -c "from main import app; print(app.title)" - frontend-build: - name: Frontend build + build-cpp-image: + name: Build — cpp + if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpp' || inputs.image == 'all') runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 + + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR + if: inputs.push == 'true' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_PREFIX }}-cpp + tags: | + type=ref,event=branch + type=sha,prefix=sha- + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build & push + uses: docker/build-push-action@v6 + with: + context: . 
+ file: .devops/Dockerfile.cpp + platforms: linux/amd64,linux/arm64 + push: ${{ inputs.push == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=cpp + cache-to: type=gha,mode=max,scope=cpp + + + build-cpu-image: + name: Build — cpu + if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpu' || inputs.image == 'all') + runs-on: ubuntu-latest + permissions: + contents: read + packages: write steps: - - name: Check out repository - uses: actions/checkout@v4 + - uses: actions/checkout@v4 + + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR + if: inputs.push == 'true' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - - name: Set up Node.js - uses: actions/setup-node@v4 + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 with: - node-version: "20" - cache: "npm" - cache-dependency-path: frontend/package-lock.json + images: ${{ env.IMAGE_PREFIX }}-cpu + tags: | + type=ref,event=branch + type=sha,prefix=sha- + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build & push + uses: docker/build-push-action@v6 + with: + context: . 
+ file: .devops/Dockerfile + platforms: linux/amd64,linux/arm64 + push: ${{ inputs.push == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=cpu + cache-to: type=gha,mode=max,scope=cpu + + + build-cuda-image: + name: Build — cuda + if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cuda' || inputs.image == 'all') + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 - - name: Install frontend dependencies - working-directory: frontend - run: npm ci + - uses: docker/setup-buildx-action@v3 - - name: Build frontend - working-directory: frontend - run: npm run build + - name: Login to GHCR + if: inputs.push == 'true' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_PREFIX }}-cuda + tags: | + type=ref,event=branch + type=sha,prefix=sha- + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build & push + uses: docker/build-push-action@v6 + with: + context: . + file: .devops/Dockerfile.backend + platforms: linux/amd64 + push: ${{ inputs.push == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=cuda + cache-to: type=gha,mode=max,scope=cuda \ No newline at end of file diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 1431739..ca9493f 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -1,73 +1,132 @@ -name: Publish Docker image +name: Release + on: - workflow_dispatch: -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + workflow_dispatch: + inputs: + version: + description: "Version tag (e.g. 
1.2.3)" + required: true + env: REGISTRY: ghcr.io + IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix + jobs: - build-and-push: - name: Build & push (${{ matrix.variant }}) - runs-on: ubuntu-latest - permissions: - contents: read - packages: write + + build-binaries: + name: Binary (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + # The "Upload to Release" step creates the release and attaches assets, which needs write access to repository contents. + permissions: + contents: write strategy: - fail-fast: false matrix: + os: [ubuntu-22.04, macos-14] include: - - variant: cpu - dockerfile: Dockerfile - tag_suffix: "" - - variant: cuda - dockerfile: Dockerfile.cuda - tag_suffix: "-cuda" + - os: ubuntu-22.04 + artifact_name: quadtrix-linux-x64 + binary: quadtrix + - os: macos-14 + artifact_name: quadtrix-macos-arm64 + binary: quadtrix steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Set lowercase image name - id: image + - uses: actions/checkout@v4 + + # NOTE(review): -march=native targets the CPU of the build runner; for binaries published as release assets consider a fixed baseline instead — confirm. + - name: Compile (Linux) + if: runner.os == 'Linux' + run: | + sudo apt-get update && sudo apt-get install -y g++ + g++ -std=c++17 -O3 -march=native \ + -I. -Iinclude \ + -o ${{ matrix.binary }} main.cpp + strip ${{ matrix.binary }} + + - name: Compile (macOS) + if: runner.os == 'macOS' + run: | + g++ -std=c++17 -O3 -march=native \ + -I. -Iinclude \ + -o ${{ matrix.binary }} main.cpp + + - name: Package run: | - echo "name=$(echo '${{ github.repository }}' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Log in to ghcr.io + mkdir dist + cp ${{ matrix.binary }} dist/ + cp README.md LICENSE dist/ + tar -czf ${{ matrix.artifact_name }}.tar.gz -C dist . 
+ + - name: Upload to Release + uses: softprops/action-gh-release@v2 + with: + tag_name: v${{ github.event.inputs.version }} + files: ${{ matrix.artifact_name }}.tar.gz + generate_release_notes: true + + publish-images: + name: Publish Docker images + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 + + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract Docker metadata - id: meta - uses: docker/metadata-action@v5 + + - name: Parse tag + id: tag + # Image references must be lowercase, but github.repository_owner keeps its original case; lowercase IMAGE_PREFIX for the steps that follow. + run: | + echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT + echo "IMAGE_PREFIX=$(echo "$IMAGE_PREFIX" | tr '[:upper:]' '[:lower:]')" >> "$GITHUB_ENV" + + - name: Build & push backend + uses: docker/build-push-action@v6 with: - images: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }} + context: . + file: .devops/Dockerfile.backend + platforms: linux/amd64,linux/arm64 + push: true tags: | - type=raw,value=latest${{ matrix.tag_suffix }},enable={{is_default_branch}} - type=semver,pattern={{version}},suffix=${{ matrix.tag_suffix }} - type=semver,pattern={{major}}.{{minor}},suffix=${{ matrix.tag_suffix }} - type=ref,event=pr,suffix=${{ matrix.tag_suffix }} - - name: Free disk space - if: matrix.variant == 'cuda' - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf /usr/local/share/boost - df -h - - name: Build and push Docker image + ${{ env.IMAGE_PREFIX }}-backend:latest + ${{ env.IMAGE_PREFIX }}-backend:${{ steps.tag.outputs.VERSION }} + cache-from: type=gha,scope=backend + cache-to: type=gha,mode=max,scope=backend + + - name: Build & push frontend uses: docker/build-push-action@v6 with: context: . 
- file: ./${{ matrix.dockerfile }} + file: .devops/Dockerfile.frontend + # NOTE(review): this patch's commit message lists "Delete .devops/Dockerfile.frontend" and its diffstat adds no such file — confirm the Dockerfile exists, otherwise this step fails. + platforms: linux/amd64,linux/arm64 push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha,scope=${{ matrix.variant }} - cache-to: type=gha,mode=max,scope=${{ matrix.variant }} - - name: Image published + tags: | + ${{ env.IMAGE_PREFIX }}-frontend:latest + ${{ env.IMAGE_PREFIX }}-frontend:${{ steps.tag.outputs.VERSION }} + cache-from: type=gha,scope=frontend + cache-to: type=gha,mode=max,scope=frontend + + - name: Build & push cpp + uses: docker/build-push-action@v6 + with: + context: . + file: .devops/Dockerfile.cpp + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.IMAGE_PREFIX }}-cpp:latest + ${{ env.IMAGE_PREFIX }}-cpp:${{ steps.tag.outputs.VERSION }} + cache-from: type=gha,scope=cpp + cache-to: type=gha,mode=max,scope=cpp + + - name: Create Release summary run: | - echo "[${{ matrix.variant }}] published:" - echo " docker pull ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest${{ matrix.tag_suffix }}" + echo "## Docker images published" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Image | Tags |" >> $GITHUB_STEP_SUMMARY + echo "|-------|------|" >> $GITHUB_STEP_SUMMARY + echo "| \`quadtrix-backend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| \`quadtrix-frontend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| \`quadtrix-cpp\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml new file mode 100644 index 0000000..c52ae09 --- /dev/null +++ b/.github/workflows/pr-check.yml @@ -0,0 +1,238 @@ +name: PR Checks + +on: + issue_comment: + types: [created] + +jobs: + slash-command: + name: Parse /run-checks + if: | + github.event.issue.pull_request != null && + contains(github.event.comment.body, 
'/run-checks') + runs-on: ubuntu-latest + outputs: + pr-sha: ${{ steps.get-sha.outputs.sha }} + steps: + - name: Check commenter permission + uses: actions/github-script@v7 + with: + script: | + const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ + owner: context.repo.owner, + repo: context.repo.repo, + username: context.actor, + }); + if (!['admin', 'write'].includes(data.permission)) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `@${context.actor} Only maintainers can trigger checks.`, + }); + core.setFailed('Unauthorized'); + } + + - name: React with rocket + uses: actions/github-script@v7 + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: ${{ github.event.comment.id }}, + content: 'rocket', + }); + + - name: Get PR head SHA + id: get-sha + uses: actions/github-script@v7 + with: + script: | + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number, + }); + core.setOutput('sha', pr.head.sha); + + + lint: + name: Lint + needs: slash-command + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.slash-command.outputs.pr-sha }} + + - name: C++ format check + run: | + sudo apt-get install -y clang-format + find . 
-name "*.cpp" -o -name "*.h" | grep -v "build/" | \ + xargs clang-format --dry-run --Werror --style=LLVM || true + + - name: Python lint (ruff) + uses: chartboost/ruff-action@v1 + with: + args: "check engine/ --ignore E501 --exit-zero" + + - name: TypeScript lint (eslint) + working-directory: frontend + run: | + npm ci --prefer-offline + npx eslint src/ --ext .ts,.tsx --max-warnings 20 || true + + + build-cpp: + name: Build C++ (${{ matrix.os }}) + needs: slash-command + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, ubuntu-24.04, macos-14] + include: + - os: ubuntu-22.04 + artifact: quadtrix-linux-x64 + - os: ubuntu-24.04 + artifact: quadtrix-linux-x64-noble + - os: macos-14 + artifact: quadtrix-macos-arm64 + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.slash-command.outputs.pr-sha }} + + - name: Install GCC (Linux) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y g++ ccache + + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ~/.ccache + key: ccache-${{ matrix.os }}-${{ hashFiles('**/*.cpp', '**/*.h') }} + restore-keys: ccache-${{ matrix.os }}- + + - name: Compile main.cpp + run: | + g++ -std=c++17 -O3 -march=native \ + -I. -Iinclude \ + -o quadtrix main.cpp + + - name: Smoke test + run: ./quadtrix --help || true + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.artifact }} + path: quadtrix + retention-days: 7 + + + validate-dockerfiles: + name: Validate Dockerfiles + needs: slash-command + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.slash-command.outputs.pr-sha }} + + + - name: Check required files exist + run: | + echo "Checking files referenced by Dockerfiles..." 
+ files=( + "main.cpp" + "engine/main.py" + "requirements.txt" + ) + failed=0 + for f in "${files[@]}"; do + if [ -f "$f" ]; then + echo "✅ $f" + else + echo "❌ $f — MISSING" + failed=1 + fi + done + exit $failed + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build check — Dockerfile.cpp (C++ engine) + uses: docker/build-push-action@v6 + with: + context: . + file: .devops/Dockerfile.cpp + platforms: linux/amd64 + push: false + cache-from: type=gha,scope=cpp + cache-to: type=gha,mode=max,scope=cpp + + + - name: Build check — Dockerfile (PyTorch CPU) + uses: docker/build-push-action@v6 + with: + context: . + file: .devops/Dockerfile + platforms: linux/amd64 + push: false + cache-from: type=gha,scope=cpu + cache-to: type=gha,mode=max,scope=cpu + + - name: Skip CUDA build check + run: echo "CUDA build skipped on PR checks — run publish-docker workflow to build cuda image." + + + test-frontend: + name: Frontend Tests + needs: [slash-command, lint] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ needs.slash-command.outputs.pr-sha }} + + - uses: actions/setup-node@v4 + with: + node-version: "20" + cache: npm + cache-dependency-path: frontend/package-lock.json + + - name: Install + working-directory: frontend + run: npm ci --prefer-offline + + - name: Type-check + working-directory: frontend + run: npx tsc --noEmit + + - name: Build check + working-directory: frontend + run: npm run build + + + post-result: + name: Post result + needs: [slash-command, lint, build-cpp, validate-dockerfiles, test-frontend] + runs-on: ubuntu-latest + # Report only when the /run-checks command was accepted: a bare always() also runs when slash-command is skipped (any other comment) and would post a success message there. + if: always() && needs.slash-command.result == 'success' + steps: + - uses: actions/github-script@v7 + with: + script: | + const jobs = ${{ toJSON(needs) }}; + // A cancelled job did not pass either, so count it as a failure. + const failed = Object.values(jobs).some(j => j.result === 'failure' || j.result === 'cancelled'); + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: failed + ? 
' Some checks failed — see Actions for details.' + : ' All checks passed!', + }); \ No newline at end of file diff --git a/CUDA/main.cu b/CUDA/main.cu new file mode 100644 index 0000000..4b24fec --- /dev/null +++ b/CUDA/main.cu @@ -0,0 +1,2070 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llmcpp/utils.h" + +#include "llmcpp/tokenizer.h" + +#include "llmcpp/dataloader.h" + +#include "llmcpp/rand.h" + +#include "llmcpp/schedulers.h" + +#include "llmcpp/sampler.h" + +#include "llmcpp/logger.h" + +#include "llmcpp/mfu.h" + +#include "llmcpp/outlier_detector.h" + +#include "llmcpp/cuda_common.h" + +#include "llmcpp/cuda_utils.cuh" + +#include "llmcpp/cublas_common.h" + +#include "llmcpp/encoder.cuh" + +#include "llmcpp/layernorm.cuh" + +#include "llmcpp/matmul.cuh" +#ifdef ENABLE_CUDNN + +#include "llmcpp/cudnn_att.h" +#else + +#include "llmcpp/attention.cuh" +#endif + +#include "llmcpp/fused_classifier.cuh" + +#include "llmcpp/adamw.cuh" + +#include "llmcpp/global_norm.cuh" + +#include "llmcpp/zero.cuh" + +char filename_buffer[512]; + +cudaDeviceProp deviceProp; +cudaStream_t main_stream; + +constexpr const size_t IO_BUF_SIZE = 32 * 1024 * 1024; + +typedef struct +{ + int max_seq_len; + int vocab_size; + int padded_vocab_size; + int num_layers; + int num_heads; + int channels; +} GPT2Config; + +constexpr const int NUM_PARAMETER_TENSORS = 16; +typedef struct +{ + floatX *wte; + floatX *wpe; + floatX *ln1w; + floatX *ln1b; + floatX *qkvw; + floatX *qkvb; + floatX *attprojw; + floatX *attprojb; + floatX *ln2w; + floatX *ln2b; + floatX *fcw; + floatX *fcb; + floatX *fcprojw; + floatX *fcprojb; + floatX *lnfw; + floatX *lnfb; +} ParameterTensors; +static_assert(sizeof(ParameterTensors) == NUM_PARAMETER_TENSORS * sizeof(void *), "Inconsistent sizes!"); + +void fill_in_parameter_sizes(size_t *param_sizes, size_t *param_sizeof, GPT2Config config) +{ + size_t Vp = config.padded_vocab_size; + size_t C = config.channels; + size_t 
maxT = config.max_seq_len; + size_t L = config.num_layers; + param_sizes[0] = Vp * C; + param_sizes[1] = maxT * C; + param_sizes[2] = L * C; + param_sizes[3] = L * C; + param_sizes[4] = L * (3 * C) * C; + param_sizes[5] = L * (3 * C); + param_sizes[6] = L * C * C; + param_sizes[7] = L * C; + param_sizes[8] = L * C; + param_sizes[9] = L * C; + param_sizes[10] = L * (4 * C) * C; + param_sizes[11] = L * (4 * C); + param_sizes[12] = L * C * (4 * C); + param_sizes[13] = L * C; + param_sizes[14] = C; + param_sizes[15] = C; + + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) + { + param_sizeof[i] = sizeof(floatX); + } +} + +void *malloc_and_point_parameters(ParameterTensors *params, size_t *param_elements, size_t *param_sizeof) +{ + + size_t num_parameters_bytes = 0; + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) + { + num_parameters_bytes += param_elements[i] * param_sizeof[i]; + } + + void *params_memory; + cudaCheck(cudaMalloc((void **)¶ms_memory, num_parameters_bytes)); + + floatX **ptrs[] = { + ¶ms->wte, ¶ms->wpe, ¶ms->ln1w, ¶ms->ln1b, ¶ms->qkvw, ¶ms->qkvb, + ¶ms->attprojw, ¶ms->attprojb, ¶ms->ln2w, ¶ms->ln2b, ¶ms->fcw, ¶ms->fcb, + ¶ms->fcprojw, ¶ms->fcprojb, ¶ms->lnfw, ¶ms->lnfb}; + char *params_memory_iterator = (char *)params_memory; + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) + { + *(ptrs[i]) = (floatX *)params_memory_iterator; + params_memory_iterator += param_elements[i] * param_sizeof[i]; + } + return params_memory; +} + +constexpr int NUM_ACTIVATION_TENSORS = 21; +typedef struct +{ + floatX *encoded; + floatX *ln1; + float *ln1_mean; + float *ln1_rstd; + floatX *atty; + +#if ENABLE_CUDNN + float *att; +#else + floatX *att; +#endif + + floatX *residual2; + floatX *ln2; + float *ln2_mean; + float *ln2_rstd; + floatX *fch; + floatX *fch_gelu; + floatX *residual3; + floatX *lnf; + float *lnf_mean; + float *lnf_rstd; + float *losses; + + floatX *qkvr; + + floatX *output; + + floatX *scratch_bt4c; + floatX *scratch_btc; +} ActivationTensors; + +struct 
TensorSpec +{ + void **ptr; + size_t size; + DType type; +}; + +#define TENSOR_SPEC(pointer, size) TensorSpec{(void **)(&pointer), (size), dtype_of(pointer)}; + +void fill_in_activation_sizes(const ActivationTensors *data, TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS], size_t B, size_t T, GPT2Config config, int recompute) +{ + size_t Vp = config.padded_vocab_size; + size_t L = config.num_layers; + size_t NH = config.num_heads; + size_t C = config.channels; + tensors[0] = TENSOR_SPEC(data->encoded, B * T * C); + + tensors[1] = TENSOR_SPEC(data->ln1, (recompute < 2) ? L * B * T * C : 0); + tensors[2] = TENSOR_SPEC(data->ln1_mean, L * B * T); + tensors[3] = TENSOR_SPEC(data->ln1_rstd, L * B * T); + tensors[4] = TENSOR_SPEC(data->atty, L * B * T * C); +#ifdef ENABLE_CUDNN + + tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T); +#else + tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T * T); +#endif + tensors[6] = TENSOR_SPEC(data->residual2, L * B * T * C); + + tensors[7] = TENSOR_SPEC(data->ln2, (recompute < 2) ? L * B * T * C : 0); + tensors[8] = TENSOR_SPEC(data->ln2_mean, L * B * T); + tensors[9] = TENSOR_SPEC(data->ln2_rstd, L * B * T); + tensors[10] = TENSOR_SPEC(data->fch, L * B * T * 4 * C); + + tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? 
L * B * T * 4 * C : B * T * 4 * C);
+    tensors[12] = TENSOR_SPEC(data->residual3, L * B * T * C);
+    tensors[13] = TENSOR_SPEC(data->lnf, B * T * C);
+    tensors[14] = TENSOR_SPEC(data->lnf_mean, B * T);
+    tensors[15] = TENSOR_SPEC(data->lnf_rstd, B * T);
+    tensors[16] = TENSOR_SPEC(data->losses, B * T);
+    tensors[17] = TENSOR_SPEC(data->qkvr, L * B * T * 3 * C);
+    tensors[18] = TENSOR_SPEC(data->output, B * T * max(3 * C, max(NH * T, Vp))); // sized for the largest of its uses
+
+    tensors[19] = TENSOR_SPEC(data->scratch_bt4c, B * T * 4 * C);
+    tensors[20] = TENSOR_SPEC(data->scratch_btc, B * T * C);
+}
+/* Sums the byte size of every spec, allocates one zero-filled device buffer of that size,
+ * and stores each tensor's slice address through its spec's ptr (NULL for size-0 specs).
+ * Returns the base address of the allocation.
+ */
+void *malloc_and_point_activations(TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS])
+{
+    size_t bytes = 0;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++)
+    {
+        bytes += tensors[i].size * sizeof_dtype(tensors[i].type);
+    }
+
+    printf0("allocating %d MiB for activations\n", (int)round(bytes / (1024 * 1024))); // integer division happens before round()
+
+    void *acts_memory;
+    cudaCheck(cudaMalloc((void **)&acts_memory, bytes));
+
+    cudaCheck(cudaMemset(acts_memory, 0, bytes));
+
+    char *acts_memory_iterator = (char *)acts_memory; // byte cursor into the allocation
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++)
+    {
+
+        if (tensors[i].size == 0)
+        {
+            *(tensors[i].ptr) = NULL;
+        }
+        else
+        {
+            *(tensors[i].ptr) = acts_memory_iterator;
+            acts_memory_iterator += tensors[i].size * sizeof_dtype(tensors[i].type);
+        }
+    }
+    return acts_memory;
+}
+/* Aggregate model state: configuration, parameter and gradient buffers, optimizer
+ * buffers, activation buffers, and per-run bookkeeping. */
+typedef struct
+{
+    GPT2Config config;
+
+    ParameterTensors params;
+    size_t param_elements[NUM_PARAMETER_TENSORS]; // element count per parameter tensor
+    size_t param_sizeof[NUM_PARAMETER_TENSORS];   // bytes per element, per parameter tensor
+    void *params_memory;
+    size_t num_parameters;
+    size_t num_parameters_bytes;
+
+    ParameterTensors grads;
+    void *grads_memory;
+
+    float *m_memory;
+    float *v_memory;
+    float *master_weights;
+
+    ActivationTensors acts;
+    TensorSpec acts_specs[NUM_ACTIVATION_TENSORS];
+    void *acts_memory;
+
+    int batch_size;
+    int seq_len;
+    int *inputs;
+    int *targets;
+    float mean_loss;
+    float *accumulated_mean_loss;
+    float *cpu_losses;
+    unsigned long long rng_state;
+    unsigned long
long rng_state_last_update; // copy of rng_state taken by gpt2_update before it draws seeds
+    int use_master_weights;
+    bool init_state;
+    int gelu_fusion;
+    int recompute;
+
+    int *workload_indices;
+    int4 *bucket_info;
+} GPT2;
+/* Puts a GPT2 struct into its pre-allocation state: all buffer pointers NULL, sizes 0,
+ * mean_loss -1, and default option values. Does not touch config or rng_state_last_update. */
+void gpt2_init_common(GPT2 *model)
+{
+
+    model->acts_memory = NULL;
+    model->inputs = NULL;
+    model->targets = NULL;
+    model->accumulated_mean_loss = NULL;
+    model->cpu_losses = NULL;
+
+    model->batch_size = 0;
+    model->seq_len = 0;
+    model->mean_loss = -1.0f;
+    model->params_memory = NULL;
+
+    model->grads_memory = NULL;
+    model->workload_indices = NULL;
+    model->bucket_info = NULL;
+
+    model->m_memory = NULL;
+    model->v_memory = NULL;
+    model->master_weights = NULL;
+
+    model->rng_state = 13371337 + multi_gpu_config.process_rank; // seed differs per process rank
+    model->use_master_weights = 1;
+    model->init_state = true;
+    model->recompute = 1;
+    model->gelu_fusion = 0;
+}
+/* Computes the parameter size tables and totals from model->config, then allocates the
+ * parameter buffer. Asserts that no parameter buffer has been allocated yet. */
+void gpt2_allocate_weights(GPT2 *model)
+{
+
+    fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config);
+    model->num_parameters = 0;
+    model->num_parameters_bytes = 0;
+    for (int i = 0; i < NUM_PARAMETER_TENSORS; i++)
+    {
+        model->num_parameters += model->param_elements[i];
+        model->num_parameters_bytes += model->param_elements[i] * model->param_sizeof[i];
+    }
+
+    assert(model->params_memory == nullptr);
+    model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof);
+}
+/* Allocates everything needed beyond the weights for batch size B and sequence length T:
+ * gradients, activations, input/target/loss buffers, host-side work lists and optimizer state. */
+void gpt2_allocate_state(GPT2 *model, int B, int T)
+{
+    printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024)));
+    assert(model->grads_memory == nullptr);
+    model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof); // same layout as the weights
+
+    model->batch_size = B;
+    model->seq_len = T;
+
+    fill_in_activation_sizes(&model->acts, model->acts_specs, B, T, model->config, model->recompute);
+    model->acts_memory = malloc_and_point_activations(model->acts_specs);
+
+    cudaCheck(cudaMalloc((void **)&model->inputs, B * T *
sizeof(int)));
+    cudaCheck(cudaMalloc((void **)&model->targets, B * T * sizeof(int)));
+    cudaCheck(cudaMalloc(((void **)&model->accumulated_mean_loss), sizeof(float))); // room for exactly one float
+    cudaCheck(cudaMallocHost((void **)&model->cpu_losses, B * T * sizeof(float)));
+
+    size_t num_c_groups = CEIL_DIV(model->config.channels, (WARP_SIZE * x128::size));
+    assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL << 31ULL)); // entry count must stay below 2^31
+    model->workload_indices = (int *)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups);
+    model->bucket_info = (int4 *)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups);
+
+    int memory_status = 0; // OR of the return values of the allocations below
+
+    size_t shard_num_parameters = multi_gpu_config.shard_num_parameters;
+    printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20);
+    printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20);
+    assert(model->m_memory == nullptr);
+    assert(model->v_memory == nullptr);
+    memory_status |= cudaMallocConditionallyManaged((void **)&model->m_memory, shard_num_parameters * sizeof(float));
+    memory_status |= cudaMallocConditionallyManaged((void **)&model->v_memory, shard_num_parameters * sizeof(float));
+
+    if (model->use_master_weights == 1)
+    {
+        assert(model->master_weights == nullptr);
+        printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20);
+        memory_status |= cudaMallocConditionallyManaged((void **)&model->master_weights, shard_num_parameters * sizeof(float));
+    }
+
+    int reduced_memory_status = (int)multi_gpu_cpu_float_sum((float)memory_status, &multi_gpu_config); // summed over processes; printed as a GPU count
+    if (reduced_memory_status >= 1)
+    {
+        printf0("WARNING: Fell back to cudaMallocManaged when initializing m,v,master_weights on %d GPUs\n", reduced_memory_status);
+        printf0(" Prevents an OOM, but code may run much slower due to device <-> host memory movement\n");
+    }
+
+    size_t free, total;
+
cudaCheck(cudaMemGetInfo(&free, &total));
+    printf0("device memory usage: %zd MiB / %zd MiB\n", (total - free) / 1024 / 1024, total / 1024 / 1024); // NOTE(review): %zd is used with size_t arguments
+
+    size_t bytes_per_sequence = 0;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++)
+    {
+        bytes_per_sequence += model->acts_specs[i].size * sizeof_dtype(model->acts_specs[i].type) / B;
+    }
+    printf0("memory per sequence: %zu MiB\n", bytes_per_sequence / 1024 / 1024);
+    printf0(" -> estimated maximum batch size: %zu\n", B + free / bytes_per_sequence);
+}
+/* Writes a checkpoint: a 256-int header (magic 20240326, a version of 3 for FP32 or 5 for
+ * BF16, then the config fields) followed by the raw parameter buffer copied from the device. */
+void gpt2_write_to_checkpoint(GPT2 *model, const char *checkpoint_path)
+{
+
+    printf0("Writing model to %s\n", checkpoint_path);
+    FILE *model_file = fopenCheck(checkpoint_path, "wb");
+
+    int model_header[256];
+    memset(model_header, 0, sizeof(model_header));
+    model_header[0] = 20240326; // magic number checked by gpt2_build_from_checkpoint
+    assert(PRECISION_MODE == PRECISION_FP32 || PRECISION_MODE == PRECISION_BF16);
+    model_header[1] = PRECISION_MODE == PRECISION_FP32 ? 3 : 5;
+    model_header[2] = model->config.max_seq_len;
+    model_header[3] = model->config.vocab_size;
+    model_header[4] = model->config.num_layers;
+    model_header[5] = model->config.num_heads;
+    model_header[6] = model->config.channels;
+    model_header[7] = model->config.padded_vocab_size;
+    fwriteCheck(model_header, sizeof(int), 256, model_file);
+
+    device_to_file(model_file, model->params_memory, model->num_parameters_bytes,
+                   IO_BUF_SIZE, main_stream);
+
+    fcloseCheck(model_file);
+}
+/* Reads a checkpoint header, validates magic and version, copies the config fields into
+ * model->config and allocates the weights. The weight payload is loaded, and the version is
+ * matched against PRECISION_MODE, only when weight_init is true. Exits the process on any
+ * validation failure. */
+void gpt2_build_from_checkpoint(GPT2 *model, const char *checkpoint_path, bool weight_init = true)
+{
+
+    if (PRECISION_MODE == PRECISION_FP16)
+    {
+
+        fprintf(stderr, "build_from_checkpoint() does not support fp16 right now.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    FILE *model_file = fopenCheck(checkpoint_path, "rb");
+    int model_header[256];
+    freadCheck(model_header, sizeof(int), 256, model_file);
+    if (model_header[0] != 20240326)
+    {
+        printf("Bad magic model file\n");
+        exit(EXIT_FAILURE);
+    }
+    int version = model_header[1];
+    if (!(version == 3 || version ==
5))
+    {
+
+        fprintf(stderr, "Bad version in model file\n");
+        fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (weight_init) // precision must match only when the weights are actually loaded
+    {
+        if (PRECISION_MODE == PRECISION_BF16 && version != 5)
+        {
+            fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path);
+            fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n");
+            exit(EXIT_FAILURE);
+        }
+        if (PRECISION_MODE == PRECISION_FP32 && version != 3)
+        {
+            fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path);
+            fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_gpt2cu PRECISION=FP32`\n");
+            fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    model->config.max_seq_len = model_header[2];
+    model->config.vocab_size = model_header[3];
+    model->config.num_layers = model_header[4];
+    model->config.num_heads = model_header[5];
+    model->config.channels = model_header[6];
+    model->config.padded_vocab_size = model_header[7];
+
+    gpt2_allocate_weights(model);
+
+    if (weight_init)
+    {
+        assert(model->params_memory != NULL);
+        file_to_device(model->params_memory, model_file, model->num_parameters_bytes, IO_BUF_SIZE, main_stream);
+    }
+    fcloseCheck(model_file);
+
+    cudaCheck(cudaDeviceSynchronize());
+}
+/* Sets num_layers, channels, num_heads and max_seq_len (1024) in *config from a depth given
+ * as a decimal string. Only the depths listed below are accepted; any other value prints an
+ * error and exits. */
+void gpt2_set_hyperparameters(GPT2Config *config, const char *depth_str)
+{
+    int depth = atoi(depth_str);
+    assert(depth > 0);
+    int channels, num_heads;
+    if (depth == 6)
+    {
+        channels = 384;
+        num_heads = 6;
+    }
+    else if (depth == 12)
+    {
+        channels = 768;
+        num_heads = 12;
+    }
+    else if (depth == 24)
+    {
+        channels = 1024;
+        num_heads = 16;
+    }
+    else if (depth == 36)
+    {
+        channels = 1280;
+        num_heads = 20;
+    }
+    else if (depth == 48)
+    {
+        channels = 1600;
+        num_heads = 25;
+    }
+    else if (depth == 60)
+    {
+        channels = 1920;
+        num_heads = 30;
+    }
+    else if (depth ==
72)
+    {
+        channels = 2880;
+        num_heads = 30;
+    }
+    else if (depth == 84)
+    {
+        channels = 3456;
+        num_heads = 36;
+    }
+    else
+    {
+        fprintf(stderr, "Unsupported GPT-2 depth: %d\n", depth);
+        exit(EXIT_FAILURE);
+    }
+    config->num_layers = depth;
+    config->channels = channels;
+    config->num_heads = num_heads;
+    config->max_seq_len = 1024;
+}
+/* Sets num_layers, channels, num_heads (channels / head_size) and max_seq_len (2048) in
+ * *config from a channel count given as a decimal string. Only the channel counts listed
+ * below are accepted; any other value prints an error and exits. */
+void gpt3_set_hyperparameters(GPT2Config *config, const char *channels_str)
+{
+
+    int channels = atoi(channels_str);
+    assert(channels > 0);
+    int depth, head_size;
+    if (channels == 384)
+    {
+        depth = 6;
+        head_size = 64;
+    }
+    else if (channels == 768)
+    {
+        depth = 12;
+        head_size = 64;
+    }
+    else if (channels == 1024)
+    {
+        depth = 24;
+        head_size = 64;
+    }
+    else if (channels == 1536)
+    {
+        depth = 24;
+        head_size = 96;
+    }
+    else if (channels == 2048)
+    {
+        depth = 24;
+        head_size = 128;
+    }
+    else if (channels == 2560)
+    {
+        depth = 32;
+        head_size = 80;
+    }
+    else if (channels == 4096)
+    {
+        depth = 32;
+        head_size = 128;
+    }
+    else if (channels == 5140) // NOTE(review): 5140 % 128 == 20, so the divisibility assert below fails for this entry
+    {
+        depth = 40;
+        head_size = 128;
+    }
+    else if (channels == 12288)
+    {
+        depth = 96;
+        head_size = 128;
+    }
+    else
+    {
+        fprintf(stderr, "Unsupported GPT-3 channels: %d\n", channels);
+        exit(EXIT_FAILURE);
+    }
+    assert(channels % head_size == 0);
+    config->num_layers = depth;
+    config->channels = channels;
+    config->num_heads = channels / head_size;
+    config->max_seq_len = 2048;
+}
+/* Configures the model from a descriptor string, allocates the weights and fills them with
+ * a fresh random initialisation. Accepted forms: "d<depth>", "gpt2:d<depth>" and
+ * "gpt3:c<channels>"; anything else prints an error and exits. */
+void gpt_build_from_descriptor(GPT2 *model, const char *descriptor)
+{
+
+    assert(descriptor != NULL);
+    size_t len = strlen(descriptor);
+    if (len > 1 && descriptor[0] == 'd')
+    {
+        gpt2_set_hyperparameters(&model->config, descriptor + 1); // skip the "d" prefix
+    }
+    else if (len > 6 && strncmp(descriptor, "gpt2:d", 6) == 0)
+    {
+        gpt2_set_hyperparameters(&model->config, descriptor + 6);
+    }
+    else if (len > 6 && strncmp(descriptor, "gpt3:c", 6) == 0)
+    {
+        gpt3_set_hyperparameters(&model->config, descriptor + 6);
+    }
+    else
+    {
+        fprintf(stderr, "Unsupported model descriptor: %s\n", descriptor);
+
exit(EXIT_FAILURE);
+    }
+
+    model->config.vocab_size = 50257;
+    model->config.padded_vocab_size = 50304;
+
+    gpt2_allocate_weights(model);
+
+    mt19937_state init_rng;
+    manual_seed(&init_rng, 42); // fixed seed for the initialisation RNG
+    floatX *params_memory_cpu = (floatX *)mallocCheck(model->num_parameters_bytes); // host staging copy of all parameters
+    memset(params_memory_cpu, 0, model->num_parameters_bytes);
+
+    float residual_scale = 1.0f / sqrtf(2.0f * model->config.num_layers);
+
+    size_t L = model->config.num_layers;
+    size_t offset = 0;
+    for (int l = 0; l < L; l++)
+    {
+        offset = 0; // element offset of tensor i inside the staging buffer, rebuilt for every layer pass
+        for (int i = 0; i < NUM_PARAMETER_TENSORS; i++)
+        {
+
+            if (l == 0 && (i == 2 || i == 8 || i == 14)) // tensors 2, 8, 14 are set to 1.0 once, covering all their elements
+            {
+                for (size_t j = 0; j < model->param_elements[i]; j++)
+                {
+                    params_memory_cpu[offset + j] = 1.0f;
+                }
+            }
+
+            if ((l == 0 && (i == 0 || i == 1)) || i == 4 || i == 6 || i == 10 || i == 12) // tensors 0, 1 once; tensors 4, 6, 10, 12 once per layer
+            {
+                size_t n = model->param_elements[i];
+                size_t layer_offset = 0;
+                if (i == 0)
+                {
+
+                    n = model->config.vocab_size * model->config.channels; // uses vocab_size, not padded_vocab_size; the remaining rows stay zero
+                }
+                if (i == 4 || i == 6 || i == 10 || i == 12)
+                {
+
+                    assert(n % L == 0);
+                    n = n / L;
+                    layer_offset = l * n;
+                }
+
+                float scale = (i == 6 || i == 12) ?
0.02f * residual_scale : 0.02f;
+
+                float *fp32_buffer = (float *)mallocCheck(n * sizeof(float)); // values are drawn in float, then cast to floatX
+                normal_(fp32_buffer, n, 0.0f, scale, &init_rng);
+                for (size_t j = 0; j < n; j++)
+                {
+                    params_memory_cpu[offset + layer_offset + j] = (floatX)fp32_buffer[j];
+                }
+                free(fp32_buffer);
+            }
+            offset += model->param_elements[i];
+        }
+    }
+
+    cudaCheck(cudaMemcpy(model->params_memory, params_memory_cpu, model->num_parameters_bytes, cudaMemcpyHostToDevice));
+    free(params_memory_cpu);
+}
+/* Runs the forward pass for B sequences of T tokens and leaves the result of the final
+ * matmul in acts.output. inputs is a host array of B*T token ids. Exits the process if the
+ * parameters are not allocated or if B or T exceed the allocated batch_size / seq_len. */
+void gpt2_forward(GPT2 *model, const int *inputs, size_t B, size_t T)
+{
+    NVTX_RANGE_FN();
+
+    if (model->params_memory == NULL)
+    {
+        printf("Error: model was not initialized properly.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t V = model->config.vocab_size;
+    const size_t Vp = model->config.padded_vocab_size;
+    const size_t L = model->config.num_layers;
+    const size_t NH = model->config.num_heads;
+    const size_t C = model->config.channels;
+
+    if (B > model->batch_size || T > model->seq_len)
+    {
+        printf("Model: B=%d T=%d, Desired: B=%d T=%d\n", model->batch_size, model->seq_len, (int)B, (int)T);
+        exit(EXIT_FAILURE);
+    }
+
+    cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice));
+
+    tokenCheck(inputs, B * T, V);
+
+    ParameterTensors params = model->params;
+    ActivationTensors acts = model->acts;
+    encoder_forward(acts.encoded, model->inputs, params.wte, params.wpe, B, T, C, main_stream);
+
+    layernorm_forward((model->recompute < 2) ? acts.ln1 : acts.lnf, acts.ln1_mean, acts.ln1_rstd, acts.encoded, params.ln1w, params.ln1b, B, T, C, main_stream); // layer 0's ln1; later layers get theirs from the fused call at the end of the loop
+
+    for (int l = 0; l < L; l++)
+    {
+        NvtxRange layer_range("Layer", l);
+
+        floatX *residual = l == 0 ?
acts.encoded : acts.residual3 + (l - 1) * B * T * C; // input to layer l: the encoder output, or the previous layer's residual3
+
+        floatX *l_qkvw = params.qkvw + l * 3 * C * C; // layer-l slices of the per-layer parameter tensors
+        floatX *l_qkvb = params.qkvb + l * 3 * C;
+        floatX *l_attprojw = params.attprojw + l * C * C;
+        floatX *l_attprojb = params.attprojb + l * C;
+        floatX *l_ln2w = params.ln2w + l * C;
+        floatX *l_ln2b = params.ln2b + l * C;
+        floatX *l_fcw = params.fcw + l * 4 * C * C;
+        floatX *l_fcb = params.fcb + l * 4 * C;
+        floatX *l_fcprojw = params.fcprojw + l * C * 4 * C;
+        floatX *l_fcprojb = params.fcprojb + l * C;
+
+        floatX *l_ln1 = (model->recompute < 2) ? acts.ln1 + l * B * T * C : acts.lnf; // acts.lnf stands in when ln1 is not stored (recompute >= 2)
+        floatX *l_qkvr = acts.qkvr + l * B * T * 3 * C;
+        floatX *l_atty = acts.atty + l * B * T * C;
+        floatX *l_residual2 = acts.residual2 + l * B * T * C;
+        floatX *l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf;
+        float *l_ln2_mean = acts.ln2_mean + l * B * T;
+        float *l_ln2_rstd = acts.ln2_rstd + l * B * T;
+        floatX *l_fch = acts.fch + l * B * T * 4 * C;
+
+        floatX *l_fch_gelu = (model->recompute < 1) ?
acts.fch_gelu + l * B * T * 4 * C : acts.fch_gelu; // a single shared buffer when recompute >= 1
+        floatX *l_residual3 = acts.residual3 + l * B * T * C;
+        floatX *scratch = (floatX *)acts.output; // acts.output doubles as scratch space inside the layer loop
+
+#ifdef ENABLE_CUDNN
+        float *l_att = (float *)acts.att + l * B * NH * T;
+        matmul_forward_cublaslt(l_qkvr, l_ln1, l_qkvw, l_qkvb, B, T, C, 3 * C, main_stream);
+        attention_forward_cudnn(l_atty, (float *)l_att, l_qkvr, B, T, NH, C, main_stream);
+#else
+        floatX *l_att = acts.att + l * B * NH * T * T;
+        if (T != model->seq_len) // zero the first B*NH*T*T elements of this layer's slice when T differs from the allocated seq_len
+        {
+            cudaCheck(cudaMemset(l_att, 0, B * NH * T * T * sizeof(floatX)));
+        }
+
+        matmul_forward_cublaslt(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, 3 * C, main_stream);
+        attention_forward(l_atty, l_qkvr, l_att, scratch, B, T, C, NH, main_stream);
+#endif
+
+        matmul_forward_cublaslt(scratch, l_atty, l_attprojw, l_attprojb, B, T, C, C, main_stream);
+        fused_residual_forward5(l_residual2, l_ln2, l_ln2_mean, l_ln2_rstd, residual, scratch, l_ln2w, l_ln2b, B * T, C, main_stream);
+        matmul_forward_cublaslt(l_fch_gelu, l_ln2, l_fcw, l_fcb, B, T, C, 4 * C, main_stream, l_fch, model->gelu_fusion);
+        matmul_forward_cublaslt(scratch, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4 * C, C, main_stream);
+
+        if (l + 1 != L) // not the last layer: the fused call also produces the next layer's ln1
+        {
+            floatX *l_ln1 = (model->recompute < 2) ?
acts.ln1 + (l + 1) * B * T * C : acts.lnf;
+            float *l_ln1_mean = acts.ln1_mean + (l + 1) * B * T;
+            float *l_ln1_rstd = acts.ln1_rstd + (l + 1) * B * T;
+            const floatX *l_ln1w = params.ln1w + (l + 1) * C;
+            const floatX *l_ln1b = params.ln1b + (l + 1) * C;
+            fused_residual_forward5(l_residual3, l_ln1, l_ln1_mean, l_ln1_rstd, l_residual2, scratch, l_ln1w, l_ln1b,
+                                    B * T, C, main_stream);
+        }
+        else // last layer: the fused call produces the final layernorm instead
+        {
+            fused_residual_forward5(l_residual3, acts.lnf, acts.lnf_mean, acts.lnf_rstd, l_residual2, scratch,
+                                    params.lnfw, params.lnfb,
+                                    B * T, C, main_stream);
+        }
+    }
+
+    matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp, main_stream); // wte is reused as the output projection, with no bias
+    cudaCheck(cudaDeviceSynchronize());
+}
+/* Runs a forward pass and returns the mean of the B*T per-position losses for the given
+ * targets. inputs and targets are host arrays of B*T token ids; targets must not be NULL. */
+float gpt2_validate(GPT2 *model, const int *inputs, const int *targets, size_t B, size_t T)
+{
+    assert(targets != NULL);
+
+    gpt2_forward(model, inputs, B, T);
+
+    const size_t V = model->config.vocab_size;
+    const size_t Vp = model->config.padded_vocab_size;
+
+    NvtxRange classifier_and_loss_range("classifier_and_loss");
+    ActivationTensors acts = model->acts;
+    float mean_loss = 0.0f;
+
+    const float dloss = 1.0f / (B * T);
+
+    cudaCheck(cudaMemset(acts.losses, 0, B * T * sizeof(float)));
+    cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice));
+    tokenCheck(targets, B * T, V);
+    fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp, False, main_stream);
+    cudaCheck(cudaMemcpy(model->cpu_losses, acts.losses, B * T * sizeof(float), cudaMemcpyDeviceToHost));
+    for (int i = 0; i < B * T; i++) // the averaging is done on the host
+    {
+        mean_loss += model->cpu_losses[i];
+    }
+    mean_loss /= B * T;
+    cudaCheck(cudaDeviceSynchronize());
+    return mean_loss;
+}
+/* Runs the backward pass for one micro-step using the activations left in model->acts.
+ * Gradients are zeroed only on micro_step 0, so they add up across micro-steps; gradient
+ * reduction and the mean-loss computation happen only on the last micro-step.
+ * Exits the process if the gradient buffer has not been allocated. */
+void gpt2_backward_and_reduce(GPT2 *model, int *inputs, const int *targets, int grad_accum_steps, int micro_step)
+{
+    if (model->grads_memory == nullptr)
+    {
+        fprintf(stderr, "Need to allocate gradients before backward");
+        exit(EXIT_FAILURE);
+    }
+    NVTX_RANGE_FN();
+    bool last_step =
micro_step == grad_accum_steps - 1;
+
+    if (micro_step == 0) // first micro-step: clear the accumulated losses and gradients
+    {
+
+        cudaCheck(cudaMemsetAsync(model->acts.losses, 0, model->batch_size * model->seq_len * sizeof(float), main_stream));
+        cudaCheck(cudaMemsetAsync(model->grads_memory, 0, model->num_parameters * sizeof(floatX), main_stream));
+    }
+
+    const size_t B = model->batch_size;
+    const size_t T = model->seq_len;
+    const size_t V = model->config.vocab_size;
+    const size_t Vp = model->config.padded_vocab_size;
+    const size_t L = model->config.num_layers;
+    const size_t NH = model->config.num_heads;
+    const size_t C = model->config.channels;
+
+    ParameterTensors params = model->params;
+    ParameterTensors grads = model->grads;
+    ActivationTensors acts = model->acts;
+
+    NvtxRange classifier_and_loss_range("classifier_and_loss");
+    const float dloss = 1.0f / (float)(B * T * grad_accum_steps);
+    cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice));
+    tokenCheck(targets, B * T, V);
+    fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp, True, main_stream);
+
+    floatX *dresidual = (floatX *)model->acts.scratch_btc; // zeroed on every call, unlike the parameter gradients
+    cudaCheck(cudaMemset(dresidual, 0, B * T * C * sizeof(floatX)));
+
+    float *scratchF = (float *)acts.output;   // two typed views of the same acts.output buffer
+    floatX *scratchX = (floatX *)acts.output;
+
+    matmul_backward(model->acts.scratch_bt4c, grads.wte, NULL, acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp, main_stream);
+
+    floatX *residual = acts.residual3 + (L - 1) * B * T * C; // last layer's residual3
+    layernorm_backward(dresidual, grads.lnfw, grads.lnfb, scratchF, model->acts.scratch_bt4c, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C, main_stream);
+
+    floatX *dl_btc = residual; // the last layer's residual3 storage is reused as a gradient buffer from here on
+
+    for (int l = L - 1; l >= 0; l--)
+    {
+        NvtxRange layer_range("Layer", l);
+
+        residual = l == 0 ?
acts.encoded : acts.residual3 + (l - 1) * B * T * C; // input that layer l received in the forward pass
+
+        floatX *l_ln1w = params.ln1w + l * C; // layer-l slices of the parameters
+        floatX *l_ln1b = params.ln1b + l * C;
+        floatX *l_qkvw = params.qkvw + l * 3 * C * C;
+        floatX *l_attprojw = params.attprojw + l * C * C;
+        floatX *l_ln2w = params.ln2w + l * C;
+        floatX *l_ln2b = params.ln2b + l * C;
+        floatX *l_fcw = params.fcw + l * 4 * C * C;
+        floatX *l_fcprojw = params.fcprojw + l * C * 4 * C;
+
+        floatX *dl_ln1w = grads.ln1w + l * C; // layer-l slices of the gradients, same layout as the parameters
+        floatX *dl_ln1b = grads.ln1b + l * C;
+        floatX *dl_qkvw = grads.qkvw + l * 3 * C * C;
+        floatX *dl_qkvb = grads.qkvb + l * 3 * C;
+        floatX *dl_attprojw = grads.attprojw + l * C * C;
+        floatX *dl_attprojb = grads.attprojb + l * C;
+        floatX *dl_ln2w = grads.ln2w + l * C;
+        floatX *dl_ln2b = grads.ln2b + l * C;
+        floatX *dl_fcw = grads.fcw + l * 4 * C * C;
+        floatX *dl_fcb = grads.fcb + l * 4 * C;
+        floatX *dl_fcprojw = grads.fcprojw + l * C * 4 * C;
+        floatX *dl_fcprojb = grads.fcprojb + l * C;
+
+        floatX *l_ln1 = (model->recompute < 2) ? acts.ln1 + l * B * T * C : acts.lnf; // layer-l slices of the activations
+        float *l_ln1_mean = acts.ln1_mean + l * B * T;
+        float *l_ln1_rstd = acts.ln1_rstd + l * B * T;
+        floatX *l_qkvr = acts.qkvr + l * B * T * 3 * C;
+        floatX *l_atty = acts.atty + l * B * T * C;
+        floatX *l_residual2 = acts.residual2 + l * B * T * C;
+        floatX *l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf;
+        float *l_ln2_mean = acts.ln2_mean + l * B * T;
+        float *l_ln2_rstd = acts.ln2_rstd + l * B * T;
+        floatX *l_fch_pre_gelu = acts.fch + l * B * T * 4 * C;
+        floatX *l_fch_gelu = (model->recompute < 1) ?
acts.fch_gelu + l * B * T * 4 * C : acts.fch_gelu;
+
+        floatX *dl_bt4c = (floatX *)model->acts.scratch_bt4c;
+
+        if (model->recompute >= 1) // the per-layer GeLU output was not stored, so rebuild it from the pre-GeLU activations
+        {
+
+            gelu_forward(l_fch_gelu, l_fch_pre_gelu, B * T * 4 * C, main_stream);
+        }
+        matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4 * C, C, main_stream, l_fch_pre_gelu, model->gelu_fusion);
+        if (model->recompute >= 2) // ln2 was not stored either, so rebuild it from residual2
+        {
+
+            layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C, main_stream);
+        }
+        matmul_backward(dl_btc, dl_fcw, dl_fcb, dl_bt4c, l_ln2, l_fcw, scratchF, B, T, C, 4 * C, main_stream);
+
+        layernorm_backward(dresidual, dl_ln2w, dl_ln2b, scratchF, dl_btc, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C, main_stream);
+        matmul_backward(dl_btc, dl_attprojw, dl_attprojb, dresidual, l_atty, l_attprojw, scratchF, B, T, C, C, main_stream);
+
+#ifdef ENABLE_CUDNN
+        float *l_att = (float *)acts.att + l * B * NH * T;
+        attention_backward_cudnn(dl_bt4c, dl_btc, l_qkvr, l_atty, (float *)l_att, B, T, NH, C, main_stream);
+#else
+        floatX *l_att = acts.att + l * B * NH * T * T;
+
+        floatX *buffer_a = l_atty;         // this layer's atty and pre-GeLU slices are handed to
+        floatX *buffer_b = l_fch_pre_gelu; // attention_backward as buffers; NOTE(review): presumably scratch, confirm
+        attention_backward(dl_bt4c, buffer_b, scratchX, buffer_a, dl_btc, l_qkvr, l_att, B, T, C, NH, main_stream);
+#endif
+        if (model->recompute >= 2)
+        {
+            layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C, main_stream);
+        }
+
+        matmul_backward(dl_btc, dl_qkvw, dl_qkvb, dl_bt4c, l_ln1, l_qkvw, scratchF, B, T, C, 3 * C, main_stream);
+
+        layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C, main_stream);
+
+        if (last_step) // hand this layer's gradients to the multi-GPU reduction; nelem[i] is the size of pointers[i]
+        {
+            floatX *const pointers[] = {
+                dl_ln1w, dl_ln1b,
+                dl_qkvw, dl_qkvb,
+                dl_attprojw, dl_attprojb,
+                dl_ln2w, dl_ln2b,
+                dl_fcw, dl_fcb,
+                dl_fcprojw, dl_fcprojb};
+            const size_t nelem[] = {
+                C, C,
+                3 * C * C, 3 * C,
+                C * C, C,
+                C, C,
+                4 * C * C, 4 * C,
+                C * 4 * C, C};
+
multi_gpu_async_reduce_gradient(pointers, nelem, &multi_gpu_config, main_stream); + } + } + encoder_backward(grads.wte, grads.wpe, scratchX, model->workload_indices, model->bucket_info, + dresidual, model->inputs, inputs, B, T, C, random_u32(&model->rng_state), main_stream); + + if (last_step) + { + + global_sum_deterministic(model->accumulated_mean_loss, acts.losses, B * T, main_stream); + +#if MULTI_GPU + ncclCheck(ncclAllReduce(model->accumulated_mean_loss, model->accumulated_mean_loss, sizeof(float), ncclFloat, ncclAvg, multi_gpu_config.nccl_comm, main_stream)); +#endif + cudaCheck(cudaMemcpyAsync(&model->mean_loss, model->accumulated_mean_loss, sizeof(float), cudaMemcpyDeviceToHost, main_stream)); + + floatX *const pointers[] = {grads.wte, grads.wpe, grads.lnfw, grads.lnfb}; + const size_t nelem[] = {Vp * C, T * C, C, C}; + multi_gpu_async_reduce_gradient(pointers, nelem, &multi_gpu_config, main_stream); + } + + cudaCheck(cudaDeviceSynchronize()); + if (last_step) + { + model->mean_loss /= B * T * grad_accum_steps; + } + else + { + model->mean_loss = -1.f; + } +} + +ShardInfo gpt2_get_tensor_at_layer(const GPT2 *model, int layer_id, int param_tensor_id) +{ + + ptrdiff_t offset = 0; + for (int i = 0; i < param_tensor_id; i++) + { + offset += (ptrdiff_t)model->param_elements[i]; + } + size_t size = model->param_elements[param_tensor_id]; + + if (2 <= param_tensor_id && param_tensor_id <= 13) + { + size /= model->config.num_layers; + offset += (ptrdiff_t)(layer_id * size); + } + return {offset, size}; +} + +float gpt2_calculate_grad_norm(GPT2 *model, MultiGpuConfig *multi_gpu_config) +{ + NVTX_RANGE_FN(); + floatX *grads_memory = (floatX *)model->grads_memory; + + float *grad_norm_squared = (float *)model->acts.output; + float grad_norm_squared_cpu = 0.0f; + + int num_slices[2] = {1, model->config.num_layers}; + int max_num_block_sums = get_max_num_block_sums(num_slices, 2); + if (multi_gpu_config->zero_stage == 1) + { + + for (int i = 0; i < 
NUM_PARAMETER_TENSORS; i++) + { + ShardInfo tensor = gpt2_get_tensor_at_layer(model, 0, i); + ShardInfo shard = multi_gpu_get_shard_offset(tensor.size, multi_gpu_config, 1); + ptrdiff_t offset = tensor.offset + shard.offset; + bool is_first_pass = (i == 0); + if ((i < 2 || i > 13)) + { + global_norm_squared(grad_norm_squared, grads_memory + offset, shard.size, 0, 1, + max_num_block_sums, is_first_pass, main_stream); + } + else + { + global_norm_squared(grad_norm_squared, grads_memory + offset, shard.size, tensor.size, model->config.num_layers, + max_num_block_sums, is_first_pass, main_stream); + } + } + global_sum_deterministic(grad_norm_squared, grad_norm_squared, max_num_block_sums, main_stream); +#if MULTI_GPU + + ncclCheck(ncclAllReduce(grad_norm_squared, grad_norm_squared, sizeof(float), ncclFloat, ncclSum, multi_gpu_config->nccl_comm, main_stream)); +#endif + } + else + { + + global_norm_squared(grad_norm_squared, grads_memory, model->num_parameters, 0, 1, max_num_block_sums, true, main_stream); + global_sum_deterministic(grad_norm_squared, grad_norm_squared, max_num_block_sums, main_stream); + } + cudaCheck(cudaMemcpy(&grad_norm_squared_cpu, grad_norm_squared, sizeof(float), cudaMemcpyDeviceToHost)); + float grad_norm_cpu = sqrtf(grad_norm_squared_cpu); + return grad_norm_cpu; +} + +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_scale, int t, + MultiGpuConfig *multi_gpu_config, bool init_from_master_only = false) +{ + + NVTX_RANGE_FN(); + if (model->grads_memory == nullptr || model->m_memory == nullptr || model->v_memory == nullptr) + { + fprintf(stderr, "Need to allocate optimizer state before update"); + exit(EXIT_FAILURE); + } + + bool init_state = model->init_state; + if (init_state) + { + model->init_state = false; + NvtxRange rng("InitOpt"); + cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 
0, multi_gpu_config->shard_num_parameters * sizeof(float))); + } + + model->rng_state_last_update = model->rng_state; + + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) + { + + unsigned int seed = random_u32(&model->rng_state); + + int num_layers = model->config.num_layers; + if ((i < 2 || i > 13)) + { + num_layers = 1; + } + + ShardInfo tensor = gpt2_get_tensor_at_layer(model, 0, i); + ShardInfo shard = multi_gpu_get_shard_offset(tensor.size, multi_gpu_config, 1); + ptrdiff_t local_offset_full = tensor.offset + shard.offset; + ptrdiff_t local_offset_partial = tensor.offset / multi_gpu_config->num_processes; + + float wd = (i == 0 || i == 1 || i == 4 || i == 6 || i == 10 || i == 12) ? weight_decay : 0.0f; + floatX *param_ptr = (floatX *)model->params_memory + local_offset_full; + floatX *grad_ptr = (floatX *)model->grads_memory + local_offset_full; + + ptrdiff_t opt_state_offset = multi_gpu_config->zero_stage < 1 ? local_offset_full : local_offset_partial; + float *m_ptr = model->m_memory + opt_state_offset; + float *v_ptr = model->v_memory + opt_state_offset; + float *master_ptr = nullptr; + if (model->master_weights != nullptr) + { + master_ptr = model->master_weights + opt_state_offset; + } + if (init_state && model->master_weights != nullptr) + { + size_t grid_size = CEIL_DIV(shard.size, 512); + copy_and_cast_kernel<<>>(master_ptr, param_ptr, shard.size, + shard.size, tensor.size); + cudaCheck(cudaGetLastError()); + } + + if (init_from_master_only) + { + + init_from_master(param_ptr, master_ptr, shard.size, tensor.size, shard.size, num_layers, seed, main_stream); + } + else + { + + adamw_update(param_ptr, master_ptr, grad_ptr, + m_ptr, v_ptr, + shard.size, tensor.size, tensor.size, shard.size, num_layers, + learning_rate, + beta1, beta2, t, eps, wd, grad_scale, seed, main_stream); + } + + if (multi_gpu_config->zero_stage == 1) + { +#if MULTI_GPU + ncclCheck(ncclGroupStart()); + for (int l = 0; l < num_layers; ++l) + { + + ncclCheck(ncclAllGather(param_ptr + l 
* tensor.size,
+                                        (floatX *)model->params_memory + tensor.offset + l * tensor.size,
+                                        shard.size, ncclFloatX,
+                                        multi_gpu_config->nccl_comm, multi_gpu_config->nccl_stream));
+            }
+            ncclCheck(ncclGroupEnd());
+#endif
+        }
+    }
+
+    cudaCheck(cudaDeviceSynchronize());
+}
+/* Estimates model FLOPs utilisation for a step that processed num_tokens tokens in dt
+ * (NOTE(review): dt is presumably in seconds, confirm against the caller). Uses
+ * 6*N + 6*L*C*T FLOPs per token and the figure returned by get_flops_promised, scaled by
+ * 1e12. Returns -1 when that figure is negative. */
+float gpt2_estimate_mfu(GPT2 *model, int num_tokens, float dt)
+{
+
+    size_t N = model->num_parameters;
+    int L = model->config.num_layers;
+    int C = model->config.channels;
+    int T = model->seq_len;
+    size_t flops_per_token = 6 * N + (size_t)6 * L * C * T;
+    size_t flops_per_step = flops_per_token * num_tokens;
+
+    float flops_achieved = (float)flops_per_step * (1.0f / dt);
+    float flops_promised = get_flops_promised(deviceProp.name, PRECISION_MODE) * 1e12f;
+    if (flops_promised < 0)
+    {
+        return -1.f;
+    }
+    float mfu = flops_achieved / flops_promised;
+    return mfu;
+}
+/* Releases every buffer owned by the model: device buffers through cudaFreeCheck, the
+ * pinned host loss buffer through cudaFreeHost, and the host work lists through free. */
+void gpt2_free(GPT2 *model)
+{
+    cudaFreeCheck(&model->params_memory);
+    cudaFreeCheck(&model->grads_memory);
+    cudaFreeCheck(&model->m_memory);
+    cudaFreeCheck(&model->v_memory);
+    cudaFreeCheck(&model->master_weights);
+    cudaFreeCheck(&model->acts_memory);
+    cudaFreeCheck(&model->inputs);
+    cudaFreeCheck(&model->targets);
+    cudaFreeCheck(&model->accumulated_mean_loss);
+    cudaCheck(cudaFreeHost(model->cpu_losses));
+    free(model->workload_indices);
+    free(model->bucket_info);
+}
+/* Process-wide setup: queries the device properties, creates main_stream, creates the
+ * cuBLASLt handle and workspace, and selects the cuBLAS compute type. */
+void common_start(bool override_enable_tf32 = true, bool print_device_info = true)
+{
+
+    cudaCheck(cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx));
+    if (print_device_info)
+    {
+        printf("[System]\n");
+        printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name);
+    }
+
+    cudaCheck(cudaStreamCreate(&main_stream));
+    nvtxNameCudaStreamA(main_stream, "main stream");
+
+    cublasCheck(cublasLtCreate(&cublaslt_handle));
+    cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size));
+
+    bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; // TF32 only for FP32 builds on compute capability major >= 8
+    cublas_compute =
enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; + +#ifdef ENABLE_CUDNN + create_cudnn(); +#endif +} + +void common_free(GPT2 &model) +{ + cudaCheck(cudaStreamDestroy(main_stream)); + cudaCheck(cudaFree(cublaslt_workspace)); + cublasCheck(cublasLtDestroy(cublaslt_handle)); +#ifdef ENABLE_CUDNN + destroy_cudnn(); +#endif +} + +void save_state(const char *filename, int step, GPT2 *model, DataLoader *loader) +{ + printf("Writing state to %s\n", filename); + FILE *state_file = fopenCheck(filename, "wb"); + int state_header[256]; + memset(state_header, 0, sizeof(state_header)); + + state_header[0] = 20240527; + state_header[1] = 1; + state_header[2] = multi_gpu_config.num_processes; + state_header[3] = multi_gpu_config.process_rank; + state_header[4] = model->use_master_weights; + state_header[5] = loader->should_shuffle; + + state_header[10] = step; + + *((unsigned long long *)&state_header[20]) = model->rng_state; + *((unsigned long long *)&state_header[22]) = model->rng_state_last_update; + + *((size_t *)&state_header[30]) = loader->current_shard_idx; + *((size_t *)&state_header[32]) = loader->current_sample_idx; + fwriteCheck(state_header, sizeof(int), 256, state_file); + + size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; + device_to_file(state_file, model->m_memory, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + device_to_file(state_file, model->v_memory, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + if (model->use_master_weights) + { + device_to_file(state_file, model->master_weights, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + } + + if (loader->should_shuffle) + { + fwriteCheck(&loader->glob_result.gl_pathc, sizeof(size_t), 1, state_file); + fwriteCheck(loader->shard_indices, sizeof(int), loader->glob_result.gl_pathc, state_file); + fwriteCheck(&loader->shard_num_samples, sizeof(size_t), 1, state_file); + fwriteCheck(loader->intra_shard_indices, sizeof(int), 
loader->shard_num_samples, state_file); + fwriteCheck(&loader->shuffle_rng, sizeof(mt19937_state), 1, state_file); + } + fcloseCheck(state_file); +} + +void load_state(int *step, GPT2 *model, DataLoader *loader, const char *filename) +{ + FILE *state_file = fopenCheck(filename, "rb"); + int state_header[256]; + freadCheck(state_header, sizeof(int), 256, state_file); + assert(state_header[0] == 20240527); + assert(state_header[1] == 1); + assert(state_header[2] == multi_gpu_config.num_processes); + assert(state_header[3] == multi_gpu_config.process_rank); + int use_master_weights = state_header[4]; + int should_shuffle = state_header[5]; + *step = state_header[10]; + model->rng_state = *((unsigned long long *)&state_header[20]); + model->rng_state_last_update = *((unsigned long long *)&state_header[22]); + size_t current_shard_idx = *((size_t *)&state_header[30]); + size_t current_sample_idx = *((size_t *)&state_header[32]); + + size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; + if (use_master_weights == 1 && !model->use_master_weights) + { + printf0("Warning: Master weights are present in state, but not enabled for current run."); + } + else if (use_master_weights == 0 && model->use_master_weights) + { + printf0("Error: Master weights requested, but not present in state file."); + exit(EXIT_FAILURE); + } + + model->init_state = false; + assert(model->m_memory != nullptr); + assert(model->v_memory != nullptr); + file_to_device(model->m_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + file_to_device(model->v_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + if (model->use_master_weights) + { + assert(model->master_weights != nullptr); + file_to_device(model->master_weights, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + + model->rng_state = model->rng_state_last_update; + gpt2_update(model, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0, 
&multi_gpu_config, true); + model->rng_state = *((unsigned long long *)&state_header[20]); + } + + loader->should_shuffle = should_shuffle; + if (should_shuffle == 1) + { + + size_t glob_result_gl_pathc; + freadCheck(&glob_result_gl_pathc, sizeof(size_t), 1, state_file); + assert(glob_result_gl_pathc == loader->glob_result.gl_pathc); + + loader->shard_indices = (int *)mallocCheck(loader->glob_result.gl_pathc * sizeof(int)); + freadCheck(loader->shard_indices, sizeof(int), loader->glob_result.gl_pathc, state_file); + + size_t shard_num_samples; + freadCheck(&shard_num_samples, sizeof(size_t), 1, state_file); + assert(shard_num_samples == loader->shard_num_samples); + + loader->intra_shard_indices = (int *)mallocCheck(loader->shard_num_samples * sizeof(int)); + freadCheck(loader->intra_shard_indices, sizeof(int), loader->shard_num_samples, state_file); + + freadCheck(&loader->shuffle_rng, sizeof(mt19937_state), 1, state_file); + } + dataloader_resume(loader, current_shard_idx, current_sample_idx); + + fcloseCheck(state_file); +} + +void write_checkpoint(const char *output_log_dir, int step, GPT2 *model, DataLoader *train_loader, MultiGpuConfig *multi_gpu_config) +{ + + printf0("Writing checkpoint at step %d\n", step); + int rank = multi_gpu_config->process_rank; + + if (rank == 0) + { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, step); + gpt2_write_to_checkpoint(model, filename_buffer); + } + + snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, rank); + save_state(filename_buffer, step, model, train_loader); + + multi_gpu_barrier(multi_gpu_config); + if (rank == 0) + { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step); + FILE *done_file = fopenCheck(filename_buffer, "w"); + fcloseCheck(done_file); + } +} + +void delete_checkpoint(const char *output_log_dir, int step, MultiGpuConfig *multi_gpu_config) +{ + + printf0("Deleting checkpoint 
at step %d\n", step); + int rank = multi_gpu_config->process_rank; + if (rank == 0) + { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, step); + remove(filename_buffer); + } + snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, rank); + remove(filename_buffer); + if (rank == 0) + { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step); + remove(filename_buffer); + } +} + +#ifndef TESTING + +void error_usage() +{ + fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); + fprintf(stderr, "Options:\n"); + + fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); + fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); + fprintf(stderr, " -e input .bin filename or descriptor, see code comments as docs. (default = gpt2_124M_bf16.bin)\n"); + fprintf(stderr, " -o output log dir (default = NULL, no logging)\n"); + fprintf(stderr, " -lg log gpu info every x steps (default = -1; disabled)\n"); + fprintf(stderr, " -n write optimization checkpoints every how many steps? (default 0, don't)\n"); + fprintf(stderr, " -nk max number of checkpoints to keep in the directory, removing old ones (0 = disable, default)\n"); + fprintf(stderr, " -nm every how many step checkpoints are considered major? major checkpoints never get deleted.\n"); + fprintf(stderr, " -y resume optimization found inside output log dir? (0=restart/overwrite, 1=resume/append)\n"); + + fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); + fprintf(stderr, " -t sequence length T (default = 1024)\n"); + fprintf(stderr, " -d total desired batch size (default = B * T * num_processes, i.e. 
no grad accumulation\n"); + + fprintf(stderr, " -x max_steps of optimization to run (-1 (default) = disable, run 1 epoch)\n"); + + fprintf(stderr, " -k learning rate scheduler (default = cosine)\n"); + fprintf(stderr, " -l learning rate (default = 3e-4f)\n"); + fprintf(stderr, " -u learning rate warmup iterations (default = 0, no warmup)\n"); + fprintf(stderr, " -q learning rate decay: final fraction, at end of training (default = 1.0 (no decay))\n"); + fprintf(stderr, " -c weight decay (default = 0.0f)\n"); + fprintf(stderr, " -sl outlier stability: skip update if loss goes above this in zscore (0.0f=off)\n"); + fprintf(stderr, " -sg outlier stability: skip update if grad_norm goes above this in zscore (0.0f=off)\n"); + + fprintf(stderr, " -v val_loss_every, how often we evaluate val loss (default = 20)\n"); + fprintf(stderr, " -m val_max_steps, up to how many val batches to estimate val loss? (default = 20)\n"); + fprintf(stderr, " -s sample_every, how often we inference the model (default = 20)\n"); + fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n"); + fprintf(stderr, " -h hellaswag eval run? (default = 0)\n"); + + fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n"); + + fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n"); + fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n"); + fprintf(stderr, " -ge gelu fusion: 0=none, 1=forward, 2=forward+backward (default: 2 for >=SM90, 0 for older GPUs)\n"); + + fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); + fprintf(stderr, " -r recompute: less memory but less speed. 
(default = 1), 0|1|2 = none,gelu,gelu+ln\n"); + + fprintf(stderr, " -pn num_processes (default = 1)\n"); + fprintf(stderr, " -pr process_rank (default = 0)\n"); + fprintf(stderr, " -pg gpus_per_node (default = 8)\n"); + fprintf(stderr, " -pm nccl_init_method: tcp,fs,mpi (default = mpi)\n"); + fprintf(stderr, " -ps server_ip - used only when nccl_init_method is tcp (default = -1)\n"); + fprintf(stderr, " -pp fs_path - used only when nccl_init_method is fs (default = /tmp)\n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) +{ + + const char *train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; + const char *val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; + const char *load_filename = "gpt2_124M_bf16.bin"; + const char *lr_scheduler_type = "cosine"; + const char *output_log_dir = NULL; + int checkpoint_every = 0; + int checkpoints_keep = 0; + int major_checkpoint_every = 0; + int resume = 0; + int B = 4; + int T = 1024; + int total_batch_size = -1; + float learning_rate = 3e-4f; + int log_gpu_every = -1; + int warmup_iterations = 0; + float final_learning_rate_frac = 1.0f; + float weight_decay = 0.0f; + float skip_update_lossz = 0.0f; + float skip_update_gradz = 0.0f; + int val_loss_every = 20; + int val_max_steps = 20; + int sample_every = 20; + int genT = 64; + int overfit_single_batch = 0; + int max_steps = -1; + int override_enable_tf32 = 1; + int use_master_weights = 1; + int gelu_fusion = -1; + int recompute = 1; + int zero_stage = 0; + int hellaswag_eval = 0; + + int num_processes = 1; + int process_rank = 0; + int gpus_per_node = 8; + char nccl_init_method[256] = "mpi"; + char server_ip[256] = ""; + char fs_path[256] = ""; + for (int i = 1; i < argc; i += 2) + { + if (i + 1 >= argc) + { + error_usage(); + } + if (argv[i][0] != '-') + { + error_usage(); + } + if (!(strlen(argv[i]) == 2 || strlen(argv[i]) == 3)) + { + error_usage(); + } + + if (argv[i][1] == 'i') + { + train_data_pattern = argv[i + 
1]; + } + else if (argv[i][1] == 'j') + { + val_data_pattern = argv[i + 1]; + } + else if (argv[i][1] == 'e') + { + load_filename = argv[i + 1]; + } + else if (argv[i][1] == 'o') + { + output_log_dir = argv[i + 1]; + } + else if (argv[i][1] == 'n' && argv[i][2] == '\0') + { + checkpoint_every = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'y') + { + resume = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'b') + { + B = atoi(argv[i + 1]); + } + else if (argv[i][1] == 't') + { + T = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'd') + { + total_batch_size = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'l' && argv[i][2] == '\0') + { + learning_rate = atof(argv[i + 1]); + } + else if (argv[i][1] == 'l' && argv[i][2] == 'g') + { + log_gpu_every = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'u') + { + warmup_iterations = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'q') + { + final_learning_rate_frac = atof(argv[i + 1]); + } + else if (argv[i][1] == 'c') + { + weight_decay = atof(argv[i + 1]); + } + else if (argv[i][1] == 'x') + { + max_steps = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'v') + { + val_loss_every = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'm') + { + val_max_steps = atoi(argv[i + 1]); + } + else if (argv[i][1] == 's' && argv[i][2] == '\0') + { + sample_every = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'g' && argv[i][2] == 'e') + { + gelu_fusion = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'g') + { + genT = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'a') + { + overfit_single_batch = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'f') + { + override_enable_tf32 = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'w') + { + use_master_weights = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'z') + { + zero_stage = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'r') + { + recompute = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'h') + { + hellaswag_eval = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'k') + { + 
lr_scheduler_type = argv[i + 1]; + } + else if (argv[i][1] == 'p' && argv[i][2] == 'i') + { + strcpy(nccl_init_method, argv[i + 1]); + } + else if (argv[i][1] == 'p' && argv[i][2] == 'f') + { + strcpy(fs_path, argv[i + 1]); + } + else if (argv[i][1] == 'p' && argv[i][2] == 's') + { + strcpy(server_ip, argv[i + 1]); + } + else if (argv[i][1] == 'p' && argv[i][2] == 'n') + { + num_processes = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'p' && argv[i][2] == 'r') + { + process_rank = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'p' && argv[i][2] == 'g') + { + gpus_per_node = atoi(argv[i + 1]); + } + else if (argv[i][1] == 's' && argv[i][2] == 'l') + { + skip_update_lossz = atof(argv[i + 1]); + } + else if (argv[i][1] == 's' && argv[i][2] == 'g') + { + skip_update_gradz = atof(argv[i + 1]); + } + else if (argv[i][1] == 'n' && argv[i][2] == 'k') + { + checkpoints_keep = atoi(argv[i + 1]); + } + else if (argv[i][1] == 'n' && argv[i][2] == 'm') + { + major_checkpoint_every = atoi(argv[i + 1]); + } + else + { + error_usage(); + } + } + + multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, server_ip, fs_path, nccl_init_method); + common_start(override_enable_tf32, false); + + assert(warmup_iterations >= 0); + if (output_log_dir != NULL) + { + assert(strlen(output_log_dir) < 400); + } + int tokens_per_fwdbwd = B * T * multi_gpu_config.num_processes; + + if (total_batch_size == -1) + { + total_batch_size = tokens_per_fwdbwd; + } + + if (gelu_fusion == -1) + { + gelu_fusion = 0; + } + + assert(total_batch_size % tokens_per_fwdbwd == 0); + int grad_accum_steps = total_batch_size / tokens_per_fwdbwd; + + if (overfit_single_batch == 1) + { + train_data_pattern = val_data_pattern; + } + printf0("+-----------------------+----------------------------------------------------+\n"); + printf0("| Parameter | Value |\n"); + printf0("+-----------------------+----------------------------------------------------+\n"); + printf0("| train data pattern | 
%-50s |\n", train_data_pattern); + printf0("| val data pattern | %-50s |\n", val_data_pattern); + printf0("| output log dir | %-50s |\n", output_log_dir == NULL ? "NULL" : output_log_dir); + printf0("| checkpoint_every | %-50d |\n", checkpoint_every); + printf0("| resume | %-50d |\n", resume); + printf0("| micro batch size B | %-50d |\n", B); + printf0("| sequence length T | %-50d |\n", T); + printf0("| total batch size | %-50d |\n", total_batch_size); + printf0("| LR scheduler | %-50s |\n", lr_scheduler_type); + printf0("| learning rate (LR) | %-50e |\n", learning_rate); + printf0("| warmup iterations | %-50d |\n", warmup_iterations); + printf0("| final LR fraction | %-50e |\n", final_learning_rate_frac); + printf0("| weight decay | %-50e |\n", weight_decay); + printf0("| skip update lossz | %-50f |\n", skip_update_lossz); + printf0("| skip update gradz | %-50f |\n", skip_update_gradz); + printf0("| max_steps | %-50d |\n", max_steps); + printf0("| val_loss_every | %-50d |\n", val_loss_every); + printf0("| val_max_steps | %-50d |\n", val_max_steps); + printf0("| sample_every | %-50d |\n", sample_every); + printf0("| genT | %-50d |\n", genT); + printf0("| overfit_single_batch | %-50d |\n", overfit_single_batch); + printf0("| use_master_weights | %-50s |\n", use_master_weights ? "enabled" : "disabled"); + printf0("| gelu_fusion | %-50d |\n", gelu_fusion); + printf0("| recompute | %-50d |\n", recompute); + printf0("+-----------------------+----------------------------------------------------+\n"); + const char *precision_str = (PRECISION_MODE == PRECISION_FP32) + ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? "TF32" : "FP32") + : (PRECISION_MODE == PRECISION_FP16 ? 
"FP16" : "BF16"); + printf0("| device | %-50s |\n", deviceProp.name); + printf0("| peak TFlops | %-50.1f |\n", get_flops_promised(deviceProp.name, PRECISION_MODE)); + printf0("| precision | %-50s |\n", precision_str); + printf0("+-----------------------+----------------------------------------------------+\n"); + + int resuming = 0; + + int resume_max_step = find_max_step(output_log_dir); + if (resume == 1) + { + assert(output_log_dir != NULL); + if (resume_max_step != -1) + { + resuming = 1; + snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, resume_max_step); + } + } + + GPT2 model; + gpt2_init_common(&model); + if (resuming == 1) + { + + bool weight_init = !use_master_weights; + gpt2_build_from_checkpoint(&model, filename_buffer, weight_init); + } + else if (ends_with_bin(load_filename)) + { + + gpt2_build_from_checkpoint(&model, load_filename); + } + else + { + + gpt_build_from_descriptor(&model, load_filename); + } + + model.use_master_weights = use_master_weights; + model.gelu_fusion = gelu_fusion; + model.recompute = recompute; + printf0("| weight init method | %-50s |\n", resuming == 1 ? "intermediate checkpoint" : load_filename); + printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len); + printf0("| vocab_size V | %-50d |\n", model.config.vocab_size); + printf0("| padded_vocab_size Vp | %-50d |\n", model.config.padded_vocab_size); + printf0("| num_layers L | %-50d |\n", model.config.num_layers); + printf0("| num_heads NH | %-50d |\n", model.config.num_heads); + printf0("| channels C | %-50d |\n", model.config.channels); + printf0("| num_parameters | %-50zu |\n", model.num_parameters); + printf0("+-----------------------+----------------------------------------------------+\n"); + + int permute_train_loader = (overfit_single_batch == 1) ? 
0 : 1; + DataLoader train_loader, val_loader; + dataloader_init(&train_loader, train_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, permute_train_loader); + dataloader_init(&val_loader, val_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, 0); + + int train_num_batches = max_steps; + if (train_num_batches == -1) + { + + size_t ntok = train_loader.num_tokens; + + train_num_batches = ntok / total_batch_size; + } + + int val_num_batches = val_max_steps; + if (val_num_batches == -1) + { + + size_t ntok = val_loader.num_tokens; + + val_num_batches = ntok / tokens_per_fwdbwd; + } + printf0("| train_num_batches | %-50d |\n", train_num_batches); + printf0("| val_num_batches | %-50d |\n", val_num_batches); + printf0("+-----------------------+----------------------------------------------------+\n"); + + EvalLoader eval_loader; + const char *hellaswag_path = "dev/data/hellaswag/hellaswag_val.bin"; + const bool hellaswag_available = access(hellaswag_path, F_OK) == 0; + const bool run_hellaswag = hellaswag_eval && hellaswag_available; + if (run_hellaswag) + { + evalloader_init(&eval_loader, hellaswag_path, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + } + printf0("| run hellaswag | %-50s |\n", run_hellaswag ? 
"yes" : "no"); + printf0("+-----------------------+----------------------------------------------------+\n"); + + set_zero_configs(&multi_gpu_config, zero_stage, model.num_parameters); + printf0("| num_processes | %-50d |\n", multi_gpu_config.num_processes); + printf0("| zero_stage | %-50d |\n", multi_gpu_config.zero_stage); + printf0("+-----------------------+----------------------------------------------------+\n"); + + if (!hellaswag_available) + { + printf0("HellaSwag eval not found at %s, skipping its evaluation\n", hellaswag_path); + printf0("You can run `python dev/data/hellaswag.py` to export and use it with `-h 1`.\n"); + } + + printf0("num_parameters: %zu => bytes: %zu\n", model.num_parameters, model.num_parameters_bytes); + printf0("allocated %d MiB for model parameters\n", (int)round(model.num_parameters_bytes / (1024 * 1024))); + + printf0("batch_size B=%d * seq_len T=%d * num_processes=%d and total_batch_size=%d\n", + B, T, multi_gpu_config.num_processes, total_batch_size); + printf0("=> setting grad_accum_steps=%d\n", grad_accum_steps); + + if (multi_gpu_config.process_rank == 0) + { + create_dir_if_not_exists(output_log_dir); + } + Logger logger; + logger_init(&logger, output_log_dir, multi_gpu_config.process_rank, resume); + + Tokenizer tokenizer; + tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); + + LearningRateScheduler lr_scheduler; + lr_scheduler_init(&lr_scheduler, lr_scheduler_type, learning_rate, + warmup_iterations, train_num_batches, final_learning_rate_frac); + + int *gen_tokens = (int *)mallocCheck(B * T * sizeof(int)); + floatX *cpu_logits_raw = (floatX *)mallocCheck(model.config.vocab_size * sizeof(floatX)); + float *cpu_logits = (float *)mallocCheck(model.config.vocab_size * sizeof(float)); + + int step = 0; + gpt2_allocate_state(&model, B, T); + if (resuming == 1) + { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, resume_max_step, multi_gpu_config.process_rank); + load_state(&step, 
&model, &train_loader, filename_buffer); + } + + OutlierDetector loss_outlier_detector, grad_norm_outlier_detector; + init_detector(&loss_outlier_detector); + init_detector(&grad_norm_outlier_detector); + + if (T < model.config.max_seq_len) + { + printf0("!!!!!!!!\n"); + printf0("WARNING:\n"); + printf0("- The training sequence length is: T=%d (set with -t)\n", T); + printf0("- The model's max sequence length is: max_seq_len=%d\n", model.config.max_seq_len); + printf0("You are attempting to train with a sequence length shorter than the model's max.\n"); + printf0("This will lead to unused parameters in the wpe position embedding weights.\n"); + printf0("If you know what you're doing you can ignore this warning.\n"); + printf0("If you're like ???, you are most likely misconfiguring your training run.\n"); + printf0("---> HINT: If you're training GPT-2 use -t 1024. If GPT-3, use -t 2048.\n"); + printf0("!!!!!!!!\n"); + } + + assert(T <= model.config.max_seq_len); + + cudaEvent_t start, end; + cudaCheck(cudaEventCreate(&start)); + cudaCheck(cudaEventCreate(&end)); + cudaCheck(cudaProfilerStart()); + double total_sum_iteration_time_s = 0.0; + float ema_tokens_per_second = 0.0f; + for (; step <= train_num_batches; step++) + { + NvtxRange step_range("Train step", step); + + int last_step = step == train_num_batches; + + if (step % val_loss_every == 0 || last_step) + { + NvtxRange validation_range("validation"); + float val_loss = 0.0f; + dataloader_reset(&val_loader); + for (int i = 0; i < val_num_batches; i++) + { + dataloader_next_batch(&val_loader); + val_loss += gpt2_validate(&model, val_loader.inputs, val_loader.targets, B, T); + } + val_loss /= val_num_batches; + val_loss = multi_gpu_cpu_float_sum(val_loss, &multi_gpu_config) / multi_gpu_config.num_processes; + printf0("val loss %f\n", val_loss); + logger_log_val(&logger, step, val_loss); + } + + if (run_hellaswag && + ((step > 0 && step % val_loss_every == 0) || last_step)) + { + NvtxRange 
evaluation_range("evaluation"); + float eval_acc_norm = 0.0f; + evalloader_reset(&eval_loader); + for (int i = 0; i < eval_loader.num_batches; i++) + { + if (i % 10 == 0) + { + printf("evaluating HellaSwag: %d/%d\r", i, eval_loader.num_batches); + } + evalloader_next_batch(&eval_loader); + gpt2_validate(&model, eval_loader.inputs, eval_loader.targets, B, T); + int correct = evalloader_stat_losses(&eval_loader, model.cpu_losses); + eval_acc_norm += (float)correct; + } + + eval_acc_norm = multi_gpu_cpu_float_sum(eval_acc_norm, &multi_gpu_config); + printf0("HellaSwag: %d/%d = %f\n", (int)eval_acc_norm, eval_loader.num_examples, eval_acc_norm / eval_loader.num_examples); + logger_log_eval(&logger, step, eval_acc_norm / eval_loader.num_examples); + } + + if (multi_gpu_config.process_rank == 0 && sample_every > 0 && + (step > 0 && (step % sample_every) == 0 || last_step)) + { + NvtxRange generation_range("generation"); + unsigned long long sample_rng_state = 1337; + + int eot_token = tokenizer.eot_token; + for (int i = 0; i < B * T; ++i) + { + gen_tokens[i] = eot_token; + } + + printf("generating:\n---\n"); + for (int t = 1; t < genT; t++) + { + NvtxRange generation_range("Generation step", t); + + gpt2_forward(&model, gen_tokens, 1, CEIL_DIV(t, min(T, 256)) * min(T, 256)); + + floatX *logits = model.acts.output + (t - 1) * model.config.padded_vocab_size; + + cudaCheck(cudaMemcpy(cpu_logits_raw, logits, model.config.vocab_size * sizeof(floatX), cudaMemcpyDeviceToHost)); + + for (int i = 0; i < model.config.vocab_size; i++) + { + cpu_logits[i] = (float)cpu_logits_raw[i]; + } + + float coin = random_f32(&sample_rng_state); + int next_token = sample_softmax(cpu_logits, model.config.vocab_size, coin); + gen_tokens[t] = next_token; + + if (tokenizer.init_ok) + { + const char *token_str = tokenizer_decode(&tokenizer, next_token); + safe_printf(token_str); + } + else + { + + printf("%d ", next_token); + } + fflush(stdout); + } + printf("\n---\n"); + } + + if ((checkpoint_every 
> 0 && output_log_dir != NULL && resuming == 0) && + ((step > 0 && step % checkpoint_every == 0) || last_step)) + { + + write_checkpoint(output_log_dir, step, &model, &train_loader, &multi_gpu_config); + + int step_delete = step - checkpoints_keep * checkpoint_every; + if (checkpoints_keep > 0 && step_delete > 0 && + (major_checkpoint_every == 0 || step_delete % major_checkpoint_every != 0)) + { + delete_checkpoint(output_log_dir, step_delete, &multi_gpu_config); + } + } + resuming = 0; + + if (last_step) + { + break; + } + + if (overfit_single_batch == 1) + { + + dataloader_reset(&train_loader); + } + + cudaCheck(cudaEventRecord(start)); + + for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++) + { + + dataloader_next_batch(&train_loader); + + gpt2_forward(&model, train_loader.inputs, B, T); + + gpt2_backward_and_reduce(&model, train_loader.inputs, train_loader.targets, grad_accum_steps, micro_step); + } + float zloss = (float)(update_detector(&loss_outlier_detector, (double)model.mean_loss)); + + float step_learning_rate = get_learning_rate(&lr_scheduler, step); + + float grad_norm = gpt2_calculate_grad_norm(&model, &multi_gpu_config); + float zgrad = (float)(update_detector(&grad_norm_outlier_detector, (double)grad_norm)); + + if (isfinite(zloss) && skip_update_lossz != 0.0f && zloss > skip_update_lossz) + { + printf0("skipping update due to loss z-score of %f\n", zloss); + } + else if (isfinite(zgrad) && skip_update_gradz != 0.0f && zgrad > skip_update_gradz) + { + printf0("skipping update due to grad z-score of %f\n", zgrad); + } + else + { + + float grad_clip = 1.0f; + float grad_scale = (grad_norm > grad_clip) ? 
grad_clip / grad_norm : 1.0f; + gpt2_update(&model, step_learning_rate, 0.9f, 0.95f, 1e-8f, weight_decay, grad_scale, step + 1, &multi_gpu_config); + } + cudaCheck(cudaEventRecord(end)); + cudaCheck(cudaEventSynchronize(end)); + + float time_elapsed_ms; + cudaCheck(cudaEventElapsedTime(&time_elapsed_ms, start, end)); + size_t tokens_processed = (size_t)multi_gpu_config.num_processes * B * T * grad_accum_steps; + float tokens_per_second = tokens_processed / time_elapsed_ms * 1000.0f; + float bias_corrected_ema_tokens_per_second = tokens_per_second; + if (step > 0) + { + total_sum_iteration_time_s += time_elapsed_ms / 1000.0f; + + ema_tokens_per_second = 0.95f * ema_tokens_per_second + 0.05f * tokens_per_second; + bias_corrected_ema_tokens_per_second = ema_tokens_per_second / (1.0f - powf(0.95f, step)); + } + float mfu = gpt2_estimate_mfu(&model, B * T * grad_accum_steps, time_elapsed_ms / 1000.0f); + printf0("step %4d/%d | loss %7.6f (%+.2fz)| norm %6.4f (%+.2fz)| lr %.2e | %.2f ms | %.1f%% bf16 MFU | %.0f tok/s\n", + step + 1, train_num_batches, model.mean_loss, zloss, grad_norm, zgrad, step_learning_rate, + time_elapsed_ms, 100 * mfu, bias_corrected_ema_tokens_per_second); + if (log_gpu_every > 0 && (step + 1) % log_gpu_every == 0) + { + GPUUtilInfo gpu_info = get_gpu_utilization_info(); + printf0(" compute %2.1f%% | memory: %2.1f%% | fan: %2d%% | %4d MHz / %4d MHz | %3d W / %3d W | %d°C / %d°C | %s\n", + gpu_info.gpu_utilization, gpu_info.mem_utilization, gpu_info.fan, gpu_info.clock, gpu_info.max_clock, gpu_info.power / 1000, gpu_info.power_limit / 1000, + gpu_info.temperature, gpu_info.temp_slowdown, gpu_info.throttle_reason); + } + logger_log_train(&logger, step, model.mean_loss, step_learning_rate, grad_norm); + + if (step == 3) + { + cudaProfilerStop(); + } + } + + printf0("total average iteration time: %f ms\n", total_sum_iteration_time_s / (train_num_batches - 1) * 1000); + + cudaCheck(cudaEventDestroy(end)); + cudaCheck(cudaEventDestroy(start)); + if 
(run_hellaswag) + { + evalloader_free(&eval_loader); + } + dataloader_free(&train_loader); + dataloader_free(&val_loader); + tokenizer_free(&tokenizer); + free(cpu_logits_raw); + free(cpu_logits); + free(gen_tokens); + multi_gpu_config_free(&multi_gpu_config); + gpt2_free(&model); + common_free(model); + return 0; +} +#endif \ No newline at end of file diff --git a/LICENSE b/LICENSE index 804d8ed..c3222c6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 Eamon +Copyright(c) 2026 Eamon Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ccb6702 --- /dev/null +++ b/Makefile @@ -0,0 +1,104 @@ +# ============================================================================= +# Quadtrix.cpp — Makefile (llama.cpp-style convenience targets) +# ============================================================================= + +.PHONY: all build clean run dev gpu train bench logs ps shell help + +SHELL := /bin/bash +SCRIPT := ./scripts/build.sh + +# ── Native C++ ─────────────────────────────────────────────────────────────── +CC := g++ +CFLAGS := -std=c++17 -O3 -march=native +IFLAGS := -I. 
-Iinclude +TARGET := quadtrix +SRCS := main.cpp + +all: $(TARGET) + +$(TARGET): $(SRCS) + $(CC) $(CFLAGS) $(IFLAGS) -o $@ $^ + @echo "✓ Built $(TARGET)" + +# Optimised release (same flags, explicit target) +release: $(SRCS) + $(CC) $(CFLAGS) $(IFLAGS) -DNDEBUG -o $(TARGET) $^ + strip $(TARGET) + +# Debug build +debug: $(SRCS) + $(CC) -std=c++17 -O0 -g -fsanitize=address,undefined \ + $(IFLAGS) -o $(TARGET)-debug $^ + +benchmark-bin: benchmark.cpp + $(CC) $(CFLAGS) $(IFLAGS) -o quadtrix-bench $^ + +clean-native: + rm -f $(TARGET) $(TARGET)-debug quadtrix-bench + +# ── Docker / Compose targets ───────────────────────────────────────────────── +build: + $(SCRIPT) up + +run: build + @echo "Stack already started." + +dev: + $(SCRIPT) dev + +gpu: + $(SCRIPT) gpu + +train-cpp: + $(SCRIPT) train-cpp + +train-torch: + $(SCRIPT) train-torch + +bench: + $(SCRIPT) bench + +logs: + $(SCRIPT) logs + +ps: + $(SCRIPT) ps + +shell: + $(SCRIPT) shell $(SERVICE) + +clean: + $(SCRIPT) clean + +# ── Misc ───────────────────────────────────────────────────────────────────── +format: + find . \( -name "*.cpp" -o -name "*.h" \) \ + ! 
-path "./build/*" \ + | xargs clang-format -i --style=LLVM + +lint-py: + ruff check backend/ engine/ + +help: + @echo "" + @echo " Quadtrix.cpp — make targets" + @echo "" + @echo " Native:" + @echo " make Build C++ binary (native)" + @echo " make release Stripped release binary" + @echo " make debug Debug binary with ASan/UBSan" + @echo " make clean-native Remove native build artifacts" + @echo " make format Run clang-format on all C++ files" + @echo "" + @echo " Docker:" + @echo " make build docker compose up --build (CPU)" + @echo " make dev Hot-reload dev stack" + @echo " make gpu CUDA GPU stack" + @echo " make train-cpp Train with C++ inside Docker" + @echo " make train-torch Train with PyTorch inside Docker" + @echo " make bench Run benchmark" + @echo " make logs Tail all logs" + @echo " make ps Show container status" + @echo " make shell Shell into backend (SERVICE=frontend to change)" + @echo " make clean Remove containers + volumes" + @echo "" diff --git a/README.md b/README.md index 0feeebe..56f99cc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # Quadtrix.cpp +

+ image +

+ A local large language model with a modular, multi-path execution architecture. Train, run inference, and serve a chat interface — all from a single repository, across bare-metal C++, PyTorch, and a React frontend. > Full technical reference: [docs](https://eamon2009.github.io/LLMs/) diff --git a/config/config.h b/config/config.h index db053cb..844efeb 100644 --- a/config/config.h +++ b/config/config.h @@ -1,34 +1,18 @@ #pragma once -// ============================================================ -// config/config.h – Global constants (mirrors config/config.py) -// ============================================================ - #include - -// ── Paths ──────────────────────────────────────────────────── -// Set CLEANED_PATH to your input text file before compiling, -// or override at runtime via the env-var GPT_DATA_PATH. static const std::string DEFAULT_CLEANED_PATH = "data/input.txt"; static const std::string DATA_PATH_ENV_VAR = "GPT_DATA_PATH"; - -// ── Reproducibility ────────────────────────────────────────── static const unsigned int SEED = 1337; - -// ── Data split ─────────────────────────────────────────────── static const double TRAIN_SPLIT = 0.9; // 90 % train, 10 % val - -// ── Hyper-parameters (identical to the Python script) ─────── static const int BATCH_SIZE = 4; static const int BLOCK_SIZE = 64; // context length -static const int MAX_ITERS = 3000; +static const int MAX_ITERS = 10000; static const int EVAL_INTERVAL = 20; static const float LEARNING_RATE = 3e-4f; -static const int EVAL_ITERS = 10; +static const int EVAL_ITERS = 1; static const int N_EMBD = 128; static const int N_HEAD = 4; static const int N_LAYER = 4; static const float DROPOUT = 0.2f; // applied during training only - -// ── Output paths ───────────────────────────────────────────── static const std::string BEST_MODEL_PATH = "best_model.bin"; static const std::string MODEL_PATH_ENV_VAR = "GPT_MODEL_PATH"; diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 
100644 index 0000000..a2e9a85 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,45 @@ +services: + frontend: + build: + context: . + dockerfile: .devops/Dockerfile.dev.frontend + ports: + - "5173:5173" + volumes: + - ./frontend:/app:delegated + - /app/node_modules + environment: + VITE_API_BASE_URL: "http://localhost:3001" + command: [ "npm", "run", "dev", "--", "--host", "0.0.0.0" ] + healthcheck: + test: [ "CMD", "wget", "-qO-", "http://localhost:5173/" ] + interval: 15s + timeout: 5s + retries: 5 + + backend: + volumes: + - ./backend:/app/backend:delegated + - ./engine:/app/engine:delegated + - models:/models + environment: + LOG_LEVEL: DEBUG + CORS_ORIGINS: "http://localhost:5173,http://localhost:3001" + command: + - python + - -m + - uvicorn + - main:app + - --host + - "0.0.0.0" + - --port + - "3001" + - --reload + - --reload-dir + - /app/backend + + redis: + ports: + - "6379:6379" +volumes: + models: diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml new file mode 100644 index 0000000..abbd02e --- /dev/null +++ b/docker-compose.gpu.yml @@ -0,0 +1,32 @@ +services: + backend: + build: + args: + CUDA: "1" + image: quadtrix/backend-cuda:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [ gpu ] + environment: + CUDA_VISIBLE_DEVICES: "0" + TORCH_CHECKPOINT_PATH: /models/best_model.pt + + train-torch: + build: + args: + CUDA: "1" + image: quadtrix/backend-cuda:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [ gpu ] + environment: + CUDA_VISIBLE_DEVICES: "0" + QUADTRIX_TRAIN_DATA: /app/data/input.txt diff --git a/docker-compose.yml b/docker-compose.yml index 8191856..7bb3572 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,34 +1,173 @@ +name: quadtrix + +x-common-env: &common-env + TZ: UTC + PYTHONUNBUFFERED: "1" + services: - quadtrix: - image: ghcr.io/eamon2009/quadtrix.cpp:latest + + frontend: build: context: . 
- dockerfile: Dockerfile + dockerfile: .devops/Dockerfile.frontend args: - # for cuda - # BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu24.04 - BASE_IMAGE: ubuntu:24.04 - + VITE_API_BASE_URL: "" + image: quadtrix/frontend:latest + container_name: quadtrix-frontend + restart: unless-stopped ports: - - "3001:3001" # FastAPI backend - - "8080:8080" # React frontend - - volumes: - # Place best_model.pt and/or best_model.bin inside ./models/ - - ./models:/app/models + - "5173:80" + depends_on: + backend: + condition: service_healthy + networks: + - quadtrix-net + healthcheck: + test: [ "CMD", "wget", "-qO-", "http://localhost/" ] + interval: 30s + timeout: 5s + retries: 3 + backend: + build: + context: . + dockerfile: .devops/Dockerfile.backend + image: quadtrix/backend:latest + container_name: quadtrix-backend + restart: unless-stopped + ports: + - "3001:3001" environment: - TORCH_CHECKPOINT_PATH: /app/models/best_model.pt - GPT_MODEL_PATH: /app/models/best_model.bin - CORS_ORIGINS: http://localhost:8080 + <<: *common-env + API_PORT: "3001" + CORS_ORIGINS: "http://localhost:5173,http://frontend" + REDIS_URL: "redis://redis:6379/0" + TORCH_CHECKPOINT_PATH: /models/best_model.pt LOG_LEVEL: INFO - MAX_SESSIONS: 1000 - SESSION_TTL_HOURS: 24 - restart: unless-stopped - + MAX_SESSIONS: "500" + SESSION_TTL_HOURS: "24" + volumes: + - models:/models + - ./engine:/app/engine:ro + depends_on: + redis: + condition: service_healthy + networks: + - quadtrix-net healthcheck: test: [ "CMD", "curl", "-f", "http://localhost:3001/api/health" ] interval: 30s timeout: 10s - retries: 5 start_period: 20s + retries: 3 + + redis: + image: redis:7-alpine + container_name: quadtrix-redis + restart: unless-stopped + command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru + volumes: + - redis-data:/data + networks: + - quadtrix-net + healthcheck: + test: [ "CMD", "redis-cli", "ping" ] + interval: 10s + timeout: 5s + retries: 5 + expose: + - "6379" + + cpp: + build: + context: . 
+ dockerfile: .devops/Dockerfile.cpp + image: quadtrix/cpp:latest + container_name: quadtrix-cpp + + restart: "no" + stdin_open: true + tty: true + volumes: + - models:/models + - ./data:/app/data:ro + environment: + <<: *common-env + GPT_DATA_PATH: /app/data/input.txt + GPT_MODEL_PATH: /models/best_model.bin + networks: + - quadtrix-net + profiles: + - cpp + + train-cpp: + build: + context: . + dockerfile: .devops/Dockerfile.cpp + image: quadtrix/cpp:latest + container_name: quadtrix-train-cpp + restart: "no" + volumes: + - models:/models + - ./data:/app/data:ro + environment: + <<: *common-env + GPT_DATA_PATH: /app/data/input.txt + GPT_MODEL_PATH: /models/best_model.bin + command: [ "data/input.txt" ] # train mode (no --chat flag) + networks: + - quadtrix-net + profiles: + - train + + train-torch: + build: + context: . + dockerfile: .devops/Dockerfile.backend + image: quadtrix/backend:latest + container_name: quadtrix-train-torch + restart: "no" + volumes: + - models:/models + - ./engine:/app/engine + - ./data:/app/data:ro + environment: + <<: *common-env + QUADTRIX_TRAIN_DATA: /app/data/input.txt + working_dir: /app + command: [ "python", "engine/main.py" ] + networks: + - quadtrix-net + profiles: + - train + + benchmark: + build: + context: . 
+ dockerfile: .devops/Dockerfile.cpp + image: quadtrix/cpp:latest + container_name: quadtrix-benchmark + restart: "no" + volumes: + - models:/models + - ./data:/app/data:ro + - ./benchmark_results.csv:/app/benchmark_results.csv + environment: + <<: *common-env + GPT_MODEL_PATH: /models/best_model.bin + + command: [ "data/input.txt", "--generate" ] + networks: + - quadtrix-net + profiles: + - benchmark + +volumes: + models: + driver: local + redis-data: + driver: local + +networks: + quadtrix-net: + driver: bridge diff --git a/frontend/src/components/chat/EmptyState.tsx b/frontend/src/components/chat/EmptyState.tsx index ce75d9a..abf94ec 100644 --- a/frontend/src/components/chat/EmptyState.tsx +++ b/frontend/src/components/chat/EmptyState.tsx @@ -1,13 +1,95 @@ export function EmptyState() { return ( -
-
-
- Quadtrix.cpp icon +
+
+ {/* Icon */} +
+ + + +
-
-

Quadtrix.cpp

-

Minimal local chat interface. Start typing below to begin.

+ +
+

+ Quadtrix.cpp +

+

+ Local char-level language model. Start a conversation below. +

+
+ + {/* Hint chips */} +
+ {["Fast local inference", "C++ & PyTorch backends", "No cloud required"].map((chip) => ( + + {chip} + + ))}
diff --git a/frontend/src/components/chat/MessageAvatar.tsx b/frontend/src/components/chat/MessageAvatar.tsx index 25373d5..c606c9d 100644 --- a/frontend/src/components/chat/MessageAvatar.tsx +++ b/frontend/src/components/chat/MessageAvatar.tsx @@ -6,15 +6,48 @@ interface MessageAvatarProps { export function MessageAvatar({ role }: MessageAvatarProps) { const isUser = role === "user"; + + if (isUser) { + return ( +
+ U +
+ ); + } + return (
- {isUser ? "You" : "Q"} + Q
); } diff --git a/frontend/src/components/chat/MessageList.tsx b/frontend/src/components/chat/MessageList.tsx index e38a0af..5de6e62 100644 --- a/frontend/src/components/chat/MessageList.tsx +++ b/frontend/src/components/chat/MessageList.tsx @@ -1,5 +1,6 @@ -import { useAutoScroll } from "../../hooks/useAutoScroll"; +import { useRef } from "react"; import type { Message } from "../../types"; +import { useAutoScroll } from "../../hooks/useAutoScroll"; import { MessageRow } from "./MessageRow"; interface MessageListProps { @@ -7,13 +8,25 @@ interface MessageListProps { } export function MessageList({ messages }: MessageListProps) { - const scrollRef = useAutoScroll(messages.length); + const bottomRef = useRef(null); + useAutoScroll(bottomRef, messages); + return ( -
-
+
+
{messages.map((message) => ( ))} +
); diff --git a/frontend/src/components/chat/MessageRow.tsx b/frontend/src/components/chat/MessageRow.tsx index 372d585..8dd3910 100644 --- a/frontend/src/components/chat/MessageRow.tsx +++ b/frontend/src/components/chat/MessageRow.tsx @@ -27,37 +27,96 @@ export function MessageRow({ message }: MessageRowProps) { return ( {!isUser && } -
-
- {isUser ? "You" : "Quadtrix"} + +
+ {/* Meta row */} +
+ {isUser ? "You" : "Quadtrix"} {formatRelativeTime(message.created_at)} {!isUser && !message.pending && ( )}
+ + {/* Bubble */}
- {message.pending ? : {message.text}} + {message.pending ? ( + + ) : ( + {message.text} + )}
+ {isUser && } ); diff --git a/frontend/src/components/chat/ThinkingIndicator.tsx b/frontend/src/components/chat/ThinkingIndicator.tsx index e83d0f5..7ec4a6c 100644 --- a/frontend/src/components/chat/ThinkingIndicator.tsx +++ b/frontend/src/components/chat/ThinkingIndicator.tsx @@ -1,12 +1,28 @@ export function ThinkingIndicator() { return ( -
- Quadtrix is thinking - - - - +
+ Generating + + {[0, 120, 240].map((delay) => ( + + ))} +
); } diff --git a/include/tensor.h b/include/tensor.h index f6ac4a5..c3526b6 100644 --- a/include/tensor.h +++ b/include/tensor.h @@ -1,8 +1,4 @@ #pragma once -// ============================================================ -// include/tensor.h – Lightweight 2-D / 3-D float tensor -// (CPU only – mirrors what PyTorch tensors do in the model) -// ============================================================ #include #include @@ -15,310 +11,557 @@ #include #include -// ------------------------------------------------------------------ -// Tensor (row-major, float32) -// shape is stored as {d0, d1} or {d0, d1, d2} -// ------------------------------------------------------------------ +#ifdef _OPENMP +#include +#endif + +#ifdef __AVX__ +#include +#endif + +#ifdef __SSE__ +#include +#endif + struct Tensor { - std::vector shape; - std::vector data; - - Tensor() = default; - - Tensor(std::vector sh, float fill = 0.0f) - : shape(sh) - { - int total = 1; - for (int d : sh) - total *= d; - data.assign(total, fill); - } - - int numel() const - { - int n = 1; - for (int d : shape) - n *= d; - return n; - } - - int ndim() const { return (int)shape.size(); } - - // ---- element access helpers -------------------------------- - float &at(int i) - { - assert(i >= 0 && i < (int)data.size()); - return data[i]; - } - float at(int i) const - { - assert(i >= 0 && i < (int)data.size()); - return data[i]; - } - - // 2-D - float &at(int r, int c) - { - return data[r * shape[1] + c]; - } - float at(int r, int c) const - { - return data[r * shape[1] + c]; - } - - // 3-D - float &at(int b, int r, int c) - { - return data[b * shape[1] * shape[2] + r * shape[2] + c]; - } - float at(int b, int r, int c) const - { - return data[b * shape[1] * shape[2] + r * shape[2] + c]; - } - - // ---- factory helpers --------------------------------------- - static Tensor zeros(std::vector sh) { return Tensor(sh, 0.0f); } - static Tensor ones(std::vector sh) { return Tensor(sh, 1.0f); } - - static Tensor 
randn(std::vector sh, float mean, float std, - std::mt19937 &rng) - { - std::normal_distribution dist(mean, std); - Tensor t(sh); - for (auto &v : t.data) - v = dist(rng); - return t; - } - - void fill(float v) { std::fill(data.begin(), data.end(), v); } - - // ---- print shape ------------------------------------------- - void print_shape(const std::string &name = "") const - { - if (!name.empty()) - std::cout << name << ": "; - std::cout << "["; - for (int i = 0; i < (int)shape.size(); ++i) - { - std::cout << shape[i]; - if (i + 1 < (int)shape.size()) - std::cout << ", "; - } - std::cout << "]" << std::endl; - } -}; + std::vector shape; + std::vector data; + + Tensor() = default; + + Tensor(std::vector sh, float fill = 0.0f) : shape(std::move(sh)) + { + int total = 1; + for (int d : shape) + total *= d; + data.reserve(total); + data.assign(total, fill); + } + + Tensor(const Tensor &) = default; + Tensor(Tensor &&) noexcept = default; + Tensor &operator=(const Tensor &) = default; + Tensor &operator=(Tensor &&) noexcept = default; + + int numel() const + { + int n = 1; + for (int d : shape) + n *= d; + return n; + } -// ------------------------------------------------------------------ -// Basic math ops (in-place and returning new tensors) -// ------------------------------------------------------------------ + int ndim() const { return (int)shape.size(); } + + float &at(int i) { return data[i]; } + float at(int i) const { return data[i]; } + + float &at(int r, int c) { return data[r * shape[1] + c]; } + float at(int r, int c) const { return data[r * shape[1] + c]; } + + float &at(int b, int r, int c) { return data[b * shape[1] * shape[2] + r * shape[2] + c]; } + float at(int b, int r, int c) const { return data[b * shape[1] * shape[2] + r * shape[2] + c]; } + + static Tensor zeros(std::vector sh) { return Tensor(sh, 0.0f); } + static Tensor ones(std::vector sh) { return Tensor(sh, 1.0f); } + + static Tensor randn(std::vector sh, float mean, float std, 
std::mt19937 &rng) + { + std::normal_distribution dist(mean, std); + Tensor t(sh); + for (auto &v : t.data) + v = dist(rng); + return t; + } + + void fill(float v) { std::fill(data.begin(), data.end(), v); } + + void print_shape(const std::string &name = "") const + { + if (!name.empty()) + std::cout << name << ": "; + std::cout << "["; + for (int i = 0; i < (int)shape.size(); ++i) + { + std::cout << shape[i]; + if (i + 1 < (int)shape.size()) + std::cout << ", "; + } + std::cout << "]" << std::endl; + } +}; -// element-wise add (same shape) inline Tensor add(const Tensor &a, const Tensor &b) { - assert(a.data.size() == b.data.size()); - Tensor c(a.shape); - for (int i = 0; i < (int)a.data.size(); ++i) - c.data[i] = a.data[i] + b.data[i]; - return c; + assert(a.data.size() == b.data.size()); + Tensor c(a.shape); + size_t n = a.data.size(); + +#ifdef __AVX__ + size_t i = 0; + size_t vec_end = n & ~7ULL; + for (; i < vec_end; i += 8) + { + __m256 va = _mm256_loadu_ps(&a.data[i]); + __m256 vb = _mm256_loadu_ps(&b.data[i]); + __m256 vc = _mm256_add_ps(va, vb); + _mm256_storeu_ps(&c.data[i], vc); + } + for (; i < n; ++i) + c.data[i] = a.data[i] + b.data[i]; +#elif defined(__SSE__) + size_t i = 0; + size_t vec_end = n & ~3ULL; + for (; i < vec_end; i += 4) + { + __m128 va = _mm_loadu_ps(&a.data[i]); + __m128 vb = _mm_loadu_ps(&b.data[i]); + __m128 vc = _mm_add_ps(va, vb); + _mm_storeu_ps(&c.data[i], vc); + } + for (; i < n; ++i) + c.data[i] = a.data[i] + b.data[i]; +#else + for (size_t i = 0; i < n; ++i) + c.data[i] = a.data[i] + b.data[i]; +#endif + return c; +} + +inline void add_inplace(Tensor &a, const Tensor &b) +{ + assert(a.data.size() == b.data.size()); + size_t n = a.data.size(); + +#ifdef __AVX__ + size_t i = 0; + size_t vec_end = n & ~7ULL; + for (; i < vec_end; i += 8) + { + __m256 va = _mm256_loadu_ps(&a.data[i]); + __m256 vb = _mm256_loadu_ps(&b.data[i]); + __m256 vc = _mm256_add_ps(va, vb); + _mm256_storeu_ps(&a.data[i], vc); + } + for (; i < n; ++i) + 
a.data[i] += b.data[i]; +#elif defined(__SSE__) + size_t i = 0; + size_t vec_end = n & ~3ULL; + for (; i < vec_end; i += 4) + { + __m128 va = _mm_loadu_ps(&a.data[i]); + __m128 vb = _mm_loadu_ps(&b.data[i]); + __m128 vc = _mm_add_ps(va, vb); + _mm_storeu_ps(&a.data[i], vc); + } + for (; i < n; ++i) + a.data[i] += b.data[i]; +#else + for (size_t i = 0; i < n; ++i) + a.data[i] += b.data[i]; +#endif } -// scalar multiply inline Tensor scale(const Tensor &a, float s) { - Tensor c(a.shape); - for (int i = 0; i < (int)a.data.size(); ++i) - c.data[i] = a.data[i] * s; - return c; + Tensor c(a.shape); + size_t n = a.data.size(); + +#ifdef __AVX__ + size_t i = 0; + size_t vec_end = n & ~7ULL; + __m256 vs = _mm256_set1_ps(s); + for (; i < vec_end; i += 8) + { + __m256 va = _mm256_loadu_ps(&a.data[i]); + __m256 vc = _mm256_mul_ps(va, vs); + _mm256_storeu_ps(&c.data[i], vc); + } + for (; i < n; ++i) + c.data[i] = a.data[i] * s; +#elif defined(__SSE__) + size_t i = 0; + size_t vec_end = n & ~3ULL; + __m128 vs = _mm_set1_ps(s); + for (; i < vec_end; i += 4) + { + __m128 va = _mm_loadu_ps(&a.data[i]); + __m128 vc = _mm_mul_ps(va, vs); + _mm_storeu_ps(&c.data[i], vc); + } + for (; i < n; ++i) + c.data[i] = a.data[i] * s; +#else + for (size_t i = 0; i < n; ++i) + c.data[i] = a.data[i] * s; +#endif + return c; +} + +inline void scale_inplace(Tensor &a, float s) +{ + size_t n = a.data.size(); + +#ifdef __AVX__ + size_t i = 0; + size_t vec_end = n & ~7ULL; + __m256 vs = _mm256_set1_ps(s); + for (; i < vec_end; i += 8) + { + __m256 va = _mm256_loadu_ps(&a.data[i]); + __m256 vc = _mm256_mul_ps(va, vs); + _mm256_storeu_ps(&a.data[i], vc); + } + for (; i < n; ++i) + a.data[i] *= s; +#elif defined(__SSE__) + size_t i = 0; + size_t vec_end = n & ~3ULL; + __m128 vs = _mm_set1_ps(s); + for (; i < vec_end; i += 4) + { + __m128 va = _mm_loadu_ps(&a.data[i]); + __m128 vc = _mm_mul_ps(va, vs); + _mm_storeu_ps(&a.data[i], vc); + } + for (; i < n; ++i) + a.data[i] *= s; +#else + for (auto &v : 
a.data) + v *= s; +#endif } -// ReLU inline Tensor relu(const Tensor &a) { - Tensor c(a.shape); - for (int i = 0; i < (int)a.data.size(); ++i) - c.data[i] = std::max(0.0f, a.data[i]); - return c; + Tensor c(a.shape); + size_t n = a.data.size(); + +#ifdef __AVX__ + size_t i = 0; + size_t vec_end = n & ~7ULL; + __m256 zero = _mm256_setzero_ps(); + for (; i < vec_end; i += 8) + { + __m256 va = _mm256_loadu_ps(&a.data[i]); + __m256 vc = _mm256_max_ps(va, zero); + _mm256_storeu_ps(&c.data[i], vc); + } + for (; i < n; ++i) + c.data[i] = std::max(0.0f, a.data[i]); +#elif defined(__SSE__) + size_t i = 0; + size_t vec_end = n & ~3ULL; + __m128 zero = _mm_setzero_ps(); + for (; i < vec_end; i += 4) + { + __m128 va = _mm_loadu_ps(&a.data[i]); + __m128 vc = _mm_max_ps(va, zero); + _mm_storeu_ps(&c.data[i], vc); + } + for (; i < n; ++i) + c.data[i] = std::max(0.0f, a.data[i]); +#else + for (size_t i = 0; i < n; ++i) + c.data[i] = std::max(0.0f, a.data[i]); +#endif + return c; } -// Softmax along last dim for 3-D tensor [B, T, C] -inline Tensor softmax3d(const Tensor &a) +inline void relu_inplace(Tensor &a) { - int B = a.shape[0], T = a.shape[1], C = a.shape[2]; - Tensor out(a.shape); - for (int b = 0; b < B; ++b) - { - for (int t = 0; t < T; ++t) - { - float maxv = -1e30f; - for (int c = 0; c < C; ++c) - maxv = std::max(maxv, a.at(b, t, c)); - float sumv = 0.0f; - for (int c = 0; c < C; ++c) - { - float e = std::exp(a.at(b, t, c) - maxv); - out.at(b, t, c) = e; - sumv += e; - } - for (int c = 0; c < C; ++c) - out.at(b, t, c) /= sumv; - } - } - return out; + size_t n = a.data.size(); + +#ifdef __AVX__ + size_t i = 0; + size_t vec_end = n & ~7ULL; + __m256 zero = _mm256_setzero_ps(); + for (; i < vec_end; i += 8) + { + __m256 va = _mm256_loadu_ps(&a.data[i]); + __m256 vc = _mm256_max_ps(va, zero); + _mm256_storeu_ps(&a.data[i], vc); + } + for (; i < n; ++i) + a.data[i] = std::max(0.0f, a.data[i]); +#elif defined(__SSE__) + size_t i = 0; + size_t vec_end = n & ~3ULL; + __m128 zero 
= _mm_setzero_ps(); + for (; i < vec_end; i += 4) + { + __m128 va = _mm_loadu_ps(&a.data[i]); + __m128 vc = _mm_max_ps(va, zero); + _mm_storeu_ps(&a.data[i], vc); + } + for (; i < n; ++i) + a.data[i] = std::max(0.0f, a.data[i]); +#else + for (auto &v : a.data) + v = std::max(0.0f, v); +#endif } -// Softmax along last dim for 2-D tensor [T, C] -inline Tensor softmax2d(const Tensor &a) +inline Tensor softmax3d(const Tensor &a) { - int T = a.shape[0], C = a.shape[1]; - Tensor out(a.shape); - for (int t = 0; t < T; ++t) - { + int B = a.shape[0], T = a.shape[1], C = a.shape[2]; + Tensor out(a.shape); + +#ifdef _OPENMP +#pragma omp parallel for collapse(2) if (B * T > 64) +#endif + for (int b = 0; b < B; ++b) + { + for (int t = 0; t < T; ++t) + { float maxv = -1e30f; for (int c = 0; c < C; ++c) - maxv = std::max(maxv, a.at(t, c)); + maxv = std::max(maxv, a.at(b, t, c)); + float sumv = 0.0f; for (int c = 0; c < C; ++c) { - float e = std::exp(a.at(t, c) - maxv); - out.at(t, c) = e; - sumv += e; + float e = std::exp(a.at(b, t, c) - maxv); + out.at(b, t, c) = e; + sumv += e; } + + float inv_sum = 1.0f / sumv; for (int c = 0; c < C; ++c) - out.at(t, c) /= sumv; - } - return out; + out.at(b, t, c) *= inv_sum; + } + } + return out; } -// Layer-norm along last dim [B, T, C] → same shape -inline Tensor layer_norm(const Tensor &x, - const Tensor &gamma, // [C] - const Tensor &beta, // [C] - float eps = 1e-5f) +inline Tensor softmax2d(const Tensor &a) { - int B = x.shape[0], T = x.shape[1], C = x.shape[2]; - Tensor out(x.shape); - for (int b = 0; b < B; ++b) - { - for (int t = 0; t < T; ++t) + int T = a.shape[0], C = a.shape[1]; + Tensor out(a.shape); + +#ifdef _OPENMP +#pragma omp parallel for if (T > 128) +#endif + for (int t = 0; t < T; ++t) + { + float maxv = -1e30f; + for (int c = 0; c < C; ++c) + maxv = std::max(maxv, a.at(t, c)); + + float sumv = 0.0f; + for (int c = 0; c < C; ++c) + { + float e = std::exp(a.at(t, c) - maxv); + out.at(t, c) = e; + sumv += e; + } + + float 
inv_sum = 1.0f / sumv; + for (int c = 0; c < C; ++c) + out.at(t, c) *= inv_sum; + } + return out; +} + +inline Tensor layer_norm(const Tensor &x, const Tensor &gamma, const Tensor &beta, float eps = 1e-5f) +{ + int B = x.shape[0], T = x.shape[1], C = x.shape[2]; + Tensor out(x.shape); + +#ifdef _OPENMP +#pragma omp parallel for collapse(2) if (B * T > 64) +#endif + for (int b = 0; b < B; ++b) + { + for (int t = 0; t < T; ++t) + { + float mu = 0.0f; + for (int c = 0; c < C; ++c) + mu += x.at(b, t, c); + mu /= C; + + float var = 0.0f; + for (int c = 0; c < C; ++c) { - float mu = 0.0f; - for (int c = 0; c < C; ++c) - mu += x.at(b, t, c); - mu /= C; - float var = 0.0f; - for (int c = 0; c < C; ++c) - { - float d = x.at(b, t, c) - mu; - var += d * d; - } - var /= C; - float inv = 1.0f / std::sqrt(var + eps); - for (int c = 0; c < C; ++c) - out.at(b, t, c) = (x.at(b, t, c) - mu) * inv * gamma.at(c) + beta.at(c); + float d = x.at(b, t, c) - mu; + var += d * d; } - } - return out; + var /= C; + + float inv = 1.0f / std::sqrt(var + eps); + for (int c = 0; c < C; ++c) + out.at(b, t, c) = (x.at(b, t, c) - mu) * inv * gamma.at(c) + beta.at(c); + } + } + return out; } -// matmul: [B, T, D] x [D, E] → [B, T, E] inline Tensor matmul(const Tensor &a, const Tensor &w) { - // a: [B, T, D] or [B, T, D] - // w: [D, E] - assert(a.ndim() == 3 && w.ndim() == 2); - int B = a.shape[0], T = a.shape[1], D = a.shape[2]; - int E = w.shape[1]; - assert(w.shape[0] == D); - Tensor out({B, T, E}, 0.0f); - for (int b = 0; b < B; ++b) - for (int t = 0; t < T; ++t) - for (int e = 0; e < E; ++e) - { - float s = 0.0f; - for (int d = 0; d < D; ++d) - s += a.at(b, t, d) * w.at(d, e); - out.at(b, t, e) = s; - } - return out; + assert(a.ndim() == 3 && w.ndim() == 2); + int B = a.shape[0], T = a.shape[1], D = a.shape[2]; + int E = w.shape[1]; + assert(w.shape[0] == D); + + Tensor out({B, T, E}, 0.0f); + + const int TILE_T = 32; + const int TILE_E = 32; + const int TILE_D = 32; + +#ifdef _OPENMP +#pragma omp 
parallel for collapse(2) if (B * T * E * D > 100000) +#endif + for (int b = 0; b < B; ++b) + { + for (int t0 = 0; t0 < T; t0 += TILE_T) + { + int t_end = std::min(t0 + TILE_T, T); + for (int e0 = 0; e0 < E; e0 += TILE_E) + { + int e_end = std::min(e0 + TILE_E, E); + for (int d0 = 0; d0 < D; d0 += TILE_D) + { + int d_end = std::min(d0 + TILE_D, D); + for (int t = t0; t < t_end; ++t) + { + for (int e = e0; e < e_end; ++e) + { + float s = out.at(b, t, e); + for (int d = d0; d < d_end; ++d) + s += a.at(b, t, d) * w.at(d, e); + out.at(b, t, e) = s; + } + } + } + } + } + } + return out; } -// add bias [E] broadcast over [B, T, E] inline Tensor add_bias(const Tensor &x, const Tensor &bias) { - assert(x.shape.back() == bias.shape[0]); - Tensor out = x; - int E = bias.shape[0]; - int stride = E; - int n = x.numel() / E; - for (int i = 0; i < n; ++i) - for (int e = 0; e < E; ++e) - out.data[i * stride + e] += bias.data[e]; - return out; + assert(x.shape.back() == bias.shape[0]); + Tensor out = x; + int E = bias.shape[0]; + int stride = E; + int n = x.numel() / E; + +#ifdef _OPENMP +#pragma omp parallel for if (n * E > 10000) +#endif + for (int i = 0; i < n; ++i) + { + for (int e = 0; e < E; ++e) + out.data[i * stride + e] += bias.data[e]; + } + return out; } -// batched matmul: [B, T, D] x [B, D, T2] → [B, T, T2] inline Tensor bmm(const Tensor &a, const Tensor &b) { - assert(a.ndim() == 3 && b.ndim() == 3); - int B = a.shape[0], T = a.shape[1], D = a.shape[2]; - int T2 = b.shape[2]; - assert(b.shape[0] == B && b.shape[1] == D); - Tensor out({B, T, T2}, 0.0f); - for (int bb = 0; bb < B; ++bb) - for (int t = 0; t < T; ++t) - for (int t2 = 0; t2 < T2; ++t2) - { - float s = 0.0f; - for (int d = 0; d < D; ++d) - s += a.at(bb, t, d) * b.at(bb, d, t2); - out.at(bb, t, t2) = s; - } - return out; + assert(a.ndim() == 3 && b.ndim() == 3); + int B = a.shape[0], T = a.shape[1], D = a.shape[2]; + int T2 = b.shape[2]; + assert(b.shape[0] == B && b.shape[1] == D); + + Tensor out({B, T, 
T2}, 0.0f); + + const int TILE = 32; + +#ifdef _OPENMP +#pragma omp parallel for if (B * T * T2 * D > 100000) +#endif + for (int bb = 0; bb < B; ++bb) + { + for (int t0 = 0; t0 < T; t0 += TILE) + { + int t_end = std::min(t0 + TILE, T); + for (int t2_0 = 0; t2_0 < T2; t2_0 += TILE) + { + int t2_end = std::min(t2_0 + TILE, T2); + for (int d0 = 0; d0 < D; d0 += TILE) + { + int d_end = std::min(d0 + TILE, D); + for (int t = t0; t < t_end; ++t) + { + for (int t2 = t2_0; t2 < t2_end; ++t2) + { + float s = out.at(bb, t, t2); + for (int d = d0; d < d_end; ++d) + s += a.at(bb, t, d) * b.at(bb, d, t2); + out.at(bb, t, t2) = s; + } + } + } + } + } + } + return out; } -// transpose last two dims of 3-D tensor [B, T, D] → [B, D, T] inline Tensor transpose23(const Tensor &a) { - int B = a.shape[0], T = a.shape[1], D = a.shape[2]; - Tensor out({B, D, T}); - for (int b = 0; b < B; ++b) + int B = a.shape[0], T = a.shape[1], D = a.shape[2]; + Tensor out({B, D, T}); + +#ifdef _OPENMP +#pragma omp parallel for collapse(2) if (B * T * D > 10000) +#endif + for (int b = 0; b < B; ++b) + { + for (int d = 0; d < D; ++d) + { for (int t = 0; t < T; ++t) - for (int d = 0; d < D; ++d) - out.at(b, d, t) = a.at(b, t, d); - return out; + out.at(b, d, t) = a.at(b, t, d); + } + } + return out; } -// concat along last dim: [B,T,D1] + [B,T,D2] → [B,T,D1+D2] inline Tensor cat_last(const std::vector &ts) { - int B = ts[0].shape[0], T = ts[0].shape[1]; - int total = 0; - for (auto &t : ts) - total += t.shape[2]; - Tensor out({B, T, total}, 0.0f); - int offset = 0; - for (auto &t : ts) - { - int D = t.shape[2]; - for (int b = 0; b < B; ++b) - for (int tt = 0; tt < T; ++tt) - for (int d = 0; d < D; ++d) - out.at(b, tt, offset + d) = t.at(b, tt, d); - offset += D; - } - return out; + int B = ts[0].shape[0], T = ts[0].shape[1]; + int total = 0; + for (auto &t : ts) + total += t.shape[2]; + + Tensor out({B, T, total}, 0.0f); + + int offset = 0; + for (auto &t : ts) + { + int D = t.shape[2]; +#ifdef _OPENMP 
+#pragma omp parallel for collapse(2) if (B * T * D > 10000) +#endif + for (int b = 0; b < B; ++b) + { + for (int tt = 0; tt < T; ++tt) + { + for (int d = 0; d < D; ++d) + out.at(b, tt, offset + d) = t.at(b, tt, d); + } + } + offset += D; + } + return out; } -// dropout mask (applied only during training) inline Tensor dropout(const Tensor &x, float p, bool training, std::mt19937 &rng) { - if (!training || p == 0.0f) - return x; - std::bernoulli_distribution dist(1.0f - p); - Tensor out = x; - float scale_v = 1.0f / (1.0f - p); - for (auto &v : out.data) - v = dist(rng) ? v * scale_v : 0.0f; - return out; + if (!training || p == 0.0f) + return x; + + std::bernoulli_distribution dist(1.0f - p); + Tensor out = x; + float scale_v = 1.0f / (1.0f - p); + + for (auto &v : out.data) + v = dist(rng) ? v * scale_v : 0.0f; + + return out; } \ No newline at end of file diff --git a/main.cpp b/main.cpp index 006af20..7fc540c 100644 --- a/main.cpp +++ b/main.cpp @@ -103,6 +103,22 @@ static std::string choose_output_path(const std::string &requested_path, return exe_relative; } +// sample N tokens from the model and print them +static void sample_tokens(GPTLanguageModel &model, + DataLoader &dl, + int n_tokens) +{ + std::vector ctx = {0}; + for (int i = 0; i < n_tokens; ++i) + { + ctx = model.generate(ctx, 1); + std::cout << dl.decode({ctx.back()}) << std::flush; + if ((int)ctx.size() > BLOCK_SIZE) + ctx = std::vector(ctx.end() - BLOCK_SIZE, ctx.end()); + } + std::cout << "\n"; +} + // estimate loss — no gradients, training=false static float estimate_loss(GPTLanguageModel &model, DataLoader &dl, @@ -184,10 +200,7 @@ int main(int argc, char *argv[]) std::signal(SIGINT, sig_handler); // Banner - std::cout << std::string(60, '=') << "\n"; std::cout << " Quadtrix v1.0 (C++)\n"; - std::cout << std::string(60, '=') << "\n"; - std::cout << "\n[INFO] Starting at: " << now_str() << "\n"; std::string data_path = DEFAULT_CLEANED_PATH; const char *env_data_path = 
std::getenv(DATA_PATH_ENV_VAR.c_str()); @@ -219,17 +232,6 @@ int main(int argc, char *argv[]) data_path = choose_existing_path(data_path, argv[0]); model_path = choose_output_path(model_path, argv[0]); - // Config print - std::cout << "\n[CONFIG] Hyperparameters:\n"; - std::cout << " batch_size=" << BATCH_SIZE - << " block_size=" << BLOCK_SIZE << "\n"; - std::cout << " max_iters=" << MAX_ITERS - << " learning_rate=" << LEARNING_RATE << "\n"; - std::cout << " n_embd=" << N_EMBD - << " n_head=" << N_HEAD - << " n_layer=" << N_LAYER - << " dropout=" << DROPOUT << "\n"; - // Data DataLoader dl; try @@ -247,13 +249,12 @@ int main(int argc, char *argv[]) GPTLanguageModel model(dl.vocab_size, N_EMBD, N_HEAD, N_LAYER, BLOCK_SIZE, SEED); long n_params = model.num_params(); - std::cout << "[MODEL] Parameters : " - << std::fixed << std::setprecision(2) - << n_params / 1.0e6f << " M (" << n_params << " total)\n"; - std::cout << "[MODEL] Architecture: " - << N_LAYER << " layers x " - << N_HEAD << " heads x " - << N_EMBD << " embedding dim\n"; + std::cout << "max_seq_len: " << BLOCK_SIZE << "\n"; + std::cout << "vocab_size: " << dl.vocab_size << "\n"; + std::cout << "num_layers: " << N_LAYER << "\n"; + std::cout << "num_heads: " << N_HEAD << "\n"; + std::cout << "channels: " << N_EMBD << "\n"; + std::cout << "num_parameters: " << n_params << "\n"; // chat mode if (chat_mode) @@ -268,9 +269,8 @@ int main(int argc, char *argv[]) } model.load(model_path); - std::cout << "[CHAT] Weights loaded from " << model_path << "\n"; - std::cout << "[CHAT] Max tokens per reply: " << chat_tokens - << " (override with --chat-tokens N)\n"; + std::cout << "weights: " << model_path << "\n"; + std::cout << "max_tokens: " << chat_tokens << "\n"; run_chat(model, dl, chat_tokens); return 0; @@ -289,10 +289,7 @@ int main(int argc, char *argv[]) } model.load(model_path); - std::cout << "\n" - << std::string(60, '-') << "\n"; - std::cout << " Quadtrix OUTPUT (Ctrl+C to stop)\n"; - std::cout << 
std::string(60, '-') << "\n\n"; + std::cout << "\ngenerating:\n"; std::vector ctx = {0}; while (!g_interrupted) { @@ -301,7 +298,7 @@ int main(int argc, char *argv[]) if ((int)ctx.size() > BLOCK_SIZE) ctx = std::vector(ctx.end() - BLOCK_SIZE, ctx.end()); } - std::cout << "\n\n[Stopped by user]\n"; + std::cout << "\n"; return 0; } @@ -312,114 +309,78 @@ int main(int argc, char *argv[]) std::mt19937 rng(SEED); // training loop - std::cout << "\n" - << std::string(60, '-') << "\n"; - std::cout << " TRAINING (" - << MAX_ITERS << " iters, eval every " - << EVAL_INTERVAL << ")\n"; - std::cout << std::string(60, '-') << "\n"; float best_val_loss = 1e30f; + float last_val_loss = 0.0f; double train_start = wall_secs(); - double last_eval_time = train_start; // ← tracks time of previous eval - for (int iter = 0; iter <= MAX_ITERS && !g_interrupted; ++iter) + // compute initial val loss before training { + std::mt19937 init_rng(SEED); + last_val_loss = estimate_loss(model, dl, "val", init_rng); + } - // Periodic eval checkpoint - if (iter % EVAL_INTERVAL == 0 || iter == MAX_ITERS) - { - double now = wall_secs(); - double elapsed = now - train_start; - - // ms per training step since the last eval window - double window_secs = now - last_eval_time; - int steps_in_win = (iter == 0) ? 1 : EVAL_INTERVAL; - double ms_per_step = window_secs * 1000.0 / steps_in_win; - - // tokens processed per second - long toks_in_win = (long)BATCH_SIZE * BLOCK_SIZE * steps_in_win; - int tok_per_sec = (window_secs > 0.0) - ? 
(int)(toks_in_win / window_secs) - : 0; - - last_eval_time = now; // reset window - - float tl = estimate_loss(model, dl, "train", rng); - float vl = estimate_loss(model, dl, "val", rng); - - bool better = vl < best_val_loss; - if (better) - { - best_val_loss = vl; - model.save(model_path); - } - - // ── new log line ───────────────────────────────────────────── - std::cout - << "step " - << std::setw(5) << iter << "/" << MAX_ITERS - << " | loss " - << std::fixed << std::setprecision(6) << tl - << " | val " - << std::fixed << std::setprecision(6) << vl - << " | lr " - << std::scientific << std::setprecision(2) << (float)LEARNING_RATE - << " | " - << std::fixed << std::setprecision(2) << ms_per_step << " ms" - << " | " << tok_per_sec << " tok/s" - << (better ? " *best*" : "") - << "\n"; - std::cout.flush(); - - if (iter == MAX_ITERS) - break; - } + for (int iter = 1; iter <= MAX_ITERS && !g_interrupted; ++iter) + { + double step_start = wall_secs(); - // Sample training batch + // train step std::pair, std::vector> batch = dl.get_batch("train", BATCH_SIZE, BLOCK_SIZE, rng); - // Forward — saves all intermediate activations SavedForward saved = forward_save(model, batch.first, BATCH_SIZE, BLOCK_SIZE, batch.second, /*training=*/true); - // Backward — exact analytical gradients - Grads grads = backward(model, saved); + float batch_loss = model.forward(batch.first, BATCH_SIZE, BLOCK_SIZE, + batch.second, false) + .second; - // AdamW parameter update + Grads grads = backward(model, saved); apply_grads(model, grads, opt); - } - double total = wall_secs() - train_start; - std::cout << "\n[DONE] Training finished in " - << std::fixed << std::setprecision(1) << total << "s (" - << total / 60.0 << " min) | Best val loss: " - << std::setprecision(4) << best_val_loss << "\n"; - std::cout << "[SAVE] Best weights saved to " << model_path << "\n"; + double step_ms = (wall_secs() - step_start) * 1000.0; + int tok_per_sec = (step_ms > 0.0) + ? 
(int)((long)BATCH_SIZE * BLOCK_SIZE / (step_ms / 1000.0)) + : 0; - // Continuous generation - std::cout << "\n" - << std::string(60, '-') << "\n"; - std::cout << " MODEL OUTPUT (Ctrl+C to stop)\n"; - std::cout << std::string(60, '-') << "\n\n"; + // every EVAL_INTERVAL steps: compute val, save if best, sample + bool better = false; + if (iter % EVAL_INTERVAL == 0 || iter == MAX_ITERS) + { + last_val_loss = estimate_loss(model, dl, "val", rng); + if (last_val_loss < best_val_loss) + { + best_val_loss = last_val_loss; + model.save(model_path); + better = true; + } + } - model.load(model_path); - model.rng = std::mt19937(SEED + 42); + // print every step + std::cout + << "step" + << std::setw(5) << iter << "/" << MAX_ITERS + << " | loss " + << std::fixed << std::setprecision(6) << batch_loss + << " | val " + << std::fixed << std::setprecision(6) << last_val_loss + << " | lr " + << std::scientific << std::setprecision(2) << (float)LEARNING_RATE + << " | " + << std::fixed << std::setprecision(2) << step_ms << " ms" + << " | " << tok_per_sec << " tok/s" + << (better ? " *best*" : "") + << "\n"; + std::cout.flush(); - std::vector ctx = {0}; - while (!g_interrupted) - { - ctx = model.generate(ctx, 1); - std::cout << dl.decode({ctx.back()}) << std::flush; - if ((int)ctx.size() > BLOCK_SIZE) - ctx = std::vector(ctx.end() - BLOCK_SIZE, ctx.end()); + // sample after every eval window + if (iter % EVAL_INTERVAL == 0 || iter == MAX_ITERS) + { + std::cout << "generating:\n"; + sample_tokens(model, dl, iter == MAX_ITERS ? 
10000 : 150); + } } - std::cout << "\n\n[Stopped by user]\n"; - std::cout << "[TOTAL] Wall-clock: " - << std::fixed << std::setprecision(1) - << (wall_secs() - train_start) << "s\n"; return 0; } \ No newline at end of file diff --git a/run.md b/run.md deleted file mode 100644 index a2c0e65..0000000 --- a/run.md +++ /dev/null @@ -1,492 +0,0 @@ -# Quadtrix.cpp - -Quadtrix.cpp is a local GPT-style language model project with multiple runtime paths: - -- Native C++ inference and training through `Quadtrix.exe` / `main.cpp` -- PyTorch checkpoint inference through `engine/inference.py` and `engine/best_model .pt` -- FastAPI middleware in `backend/` -- React + TypeScript chat UI in `frontend/` - -The web interface can chat with both model backends: - -- `C++`: calls the C++ HTTP server on port `8080` -- `.pt`: loads the PyTorch checkpoint directly from `engine/best_model .pt` - -## Project Layout - -```text -Quadtrix.cpp/ - Quadtrix.exe - main.cpp - config/ - include/ - data/ - engine/ - inference.py - main.py - fine-tune/main.py - best_model .pt - fineweb_30mb.txt - backend/ - main.py - inference.py - requirements.txt - frontend/ - package.json - src/ -``` - -## Requirements - -- Python 3.10+ -- Node.js 18+ -- npm -- C++17 compiler if you want to rebuild the C++ executable - -## 1. Python Setup - -From the repo root: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -python -m venv .venv -.\.venv\Scripts\python.exe -m pip install --upgrade pip -``` - -Install backend and PyTorch inference dependencies: - -```powershell -cd backend -..\.venv\Scripts\python.exe -m pip install -r requirements.txt -``` - -## 2. Frontend Setup - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend -npm.cmd install -npm.cmd run build -``` - -Run the frontend: - -```powershell -npm.cmd run dev -``` - -Frontend URL: - -```text -http://localhost:5173 -``` - -## Install as a Web App - -The frontend is configured as an installable PWA. 
It includes: - -- `frontend/manifest.webmanifest` -- `frontend/sw.js` -- `frontend/public/manifest.webmanifest` -- `frontend/public/sw.js` -- service worker registration in `frontend/src/registerServiceWorker.ts` - -For the clean installable version, build and preview the frontend: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend -npm.cmd run build -npm.cmd run preview -``` - -Open the preview URL, usually: - -```text -http://localhost:4173 -``` - -Then install from the browser: - -- Chrome / Edge: click the install icon in the address bar -- Or open browser menu -> Apps -> Install this site as an app - -The installed app still talks to the backend at: - -```text -http://localhost:3001 -``` - -So keep the FastAPI backend running when chatting. - -## 3. Run the PyTorch `.pt` Model in the Web UI - -The `.pt` model does not need a separate model server. The FastAPI backend loads it directly from: - -```text -engine/best_model .pt -``` - -Start the backend: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\backend -..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001 -``` - -Start the frontend in another terminal: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend -npm.cmd run dev -``` - -Open: - -```text -http://localhost:5173 -``` - -Select `.pt` in the top bar. - -## 4. Run the C++ Model in the Web UI - -Start the C++ inference server: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\Quadtrix.exe data\input.txt --server --port 8080 -``` - -Start the backend: - -```powershell -cd backend -..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001 -``` - -Start the frontend: - -```powershell -cd ..\frontend -npm.cmd run dev -``` - -Open: - -```text -http://localhost:5173 -``` - -Select `C++` in the top bar. - -## 5. Run Both Backends Together - -Use three terminals. 
- -Terminal 1: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\Quadtrix.exe data\input.txt --server --port 8080 -``` - -Terminal 2: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\backend -..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001 -``` - -Terminal 3: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend -npm.cmd run dev -``` - -Open: - -```text -http://localhost:5173 -``` - -Switch between `C++` and `.pt` from the model selector. - -## 6. Backend API - -Base URL: - -```text -http://localhost:3001 -``` - -Routes: - -```text -GET /api/health -GET /api/stats -POST /api/chat -GET /api/sessions -POST /api/sessions -DELETE /api/sessions/{id} -GET /api/sessions/{id}/messages -POST /api/feedback -``` - -Example `.pt` chat request: - -```powershell -Invoke-RestMethod ` - -Uri http://localhost:3001/api/chat ` - -Method Post ` - -ContentType "application/json" ` - -Body '{ - "session_id": null, - "prompt": "Once upon a time", - "max_tokens": 100, - "temperature": 1.0, - "stream": false, - "model_backend": "torch" - }' -``` - -Example C++ chat request: - -```powershell -Invoke-RestMethod ` - -Uri http://localhost:3001/api/chat ` - -Method Post ` - -ContentType "application/json" ` - -Body '{ - "session_id": null, - "prompt": "Once upon a time", - "max_tokens": 100, - "temperature": 1.0, - "stream": false, - "model_backend": "cpp" - }' -``` - -## 7. Environment Variables - -Backend defaults are in `backend/.env.example`: - -```text -API_PORT=3001 -CORS_ORIGINS=http://localhost:5173 -REDIS_URL= -LOG_LEVEL=INFO -MAX_SESSIONS=1000 -SESSION_TTL_HOURS=24 -CPP_SERVER_URL=http://localhost:8080 -TORCH_CHECKPOINT_PATH=../engine/best_model .pt -REQUEST_TIMEOUT_SECONDS=60 -``` - -Create `backend/.env` if you want overrides. - -Frontend defaults are in `frontend/.env.example`: - -```text -VITE_API_BASE_URL=http://localhost:3001 -``` - -## 8. 
PyTorch CLI Inference - -Interactive chat: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\.venv\Scripts\python.exe engine\inference.py --checkpoint "engine\best_model .pt" -``` - -Generate once: - -```powershell -.\.venv\Scripts\python.exe engine\inference.py --checkpoint "engine\best_model .pt" --prompt "Hello" --max-new-tokens 100 --temperature 1.0 -``` - -## 9. PyTorch Training - -Main training: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\.venv\Scripts\python.exe engine\main.py -``` - -Fine-tuning: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\.venv\Scripts\python.exe engine\fine-tune\main.py -``` - -## 10. C++ Build and Run - -Build manually: - -```powershell -g++ -std=c++17 -O2 -I. -Iinclude -o Quadtrix.exe main.cpp -``` - -Train from scratch: - -```powershell -.\Quadtrix.exe data\input.txt -``` - -Terminal chat: - -```powershell -.\Quadtrix.exe data\input.txt --chat -``` - -Raw generation: - -```powershell -.\Quadtrix.exe data\input.txt --generate -``` - -HTTP server: - -```powershell -.\Quadtrix.exe data\input.txt --server --port 8080 -``` - -## 11. Health Checks - -Backend: - -```powershell -Invoke-RestMethod http://localhost:3001/api/health -``` - -C++ server: - -```powershell -Invoke-RestMethod http://localhost:8080/health -``` - -Frontend: - -```text -http://localhost:5173 -``` - -When only `.pt` is available, backend health should show: - -```json -{ - "status": "degraded", - "api": "ok", - "cpp_server": "unreachable", - "torch_model": "ok" -} -``` - -When both are available, backend health should show: - -```json -{ - "status": "ok", - "api": "ok", - "cpp_server": "ok", - "torch_model": "ok" -} -``` - -## 12. 
Troubleshooting - -### PowerShell blocks `npm` - -Use `npm.cmd`: - -```powershell -npm.cmd run dev -npm.cmd run build -``` - -### `.pt` model is unavailable - -Check that this file exists: - -```text -engine/best_model .pt -``` - -Then check Python dependencies: - -```powershell -cd backend -..\.venv\Scripts\python.exe -c "import torch, tiktoken; print(torch.__version__)" -``` - -### Backend cannot import FastAPI - -Install dependencies into the repo venv: - -```powershell -cd backend -..\.venv\Scripts\python.exe -m pip install -r requirements.txt -``` - -### C++ option is offline - -Start the C++ server: - -```powershell -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\Quadtrix.exe data\input.txt --server --port 8080 -``` - -### Frontend cannot reach backend - -Check: - -```text -http://localhost:3001/api/health -``` - -Make sure frontend config points to: - -```text -VITE_API_BASE_URL=http://localhost:3001 -``` - -### Port already in use - -```powershell -Get-NetTCPConnection -LocalPort 3001 -Get-NetTCPConnection -LocalPort 5173 -Get-NetTCPConnection -LocalPort 8080 -``` - -## Recommended Daily Run - -```powershell -# Terminal 1 -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp -.\Quadtrix.exe data\input.txt --server --port 8080 -``` - -```powershell -# Terminal 2 -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\backend -..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001 -``` - -```powershell -# Terminal 3 -cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend -npm.cmd run dev -``` - -Open: - -```text -http://localhost:5173 -``` - -## License - -MIT
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100755
index 0000000..e36678b
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,209 @@
+#!/usr/bin/env bash
+# Quadtrix.cpp — build.sh
+#
+# Docker Compose front-end for building, running and training Quadtrix.
+# Needs bash (arrays, [[ ]], pipefail) but no bash 4+ features, so the
+# bash 3.2 shipped with macOS works too.
+#
+# Usage
+#   ./scripts/build.sh              # full stack, CPU
+#   ./scripts/build.sh dev          # hot-reload dev mode
+#   ./scripts/build.sh gpu          # CUDA backend
+#   ./scripts/build.sh cpp-only     # compile + run C++ engine
+#   ./scripts/build.sh train-cpp    # train with C++ backend
+#   ./scripts/build.sh train-torch  # train with PyTorch backend
+#   ./scripts/build.sh bench        # run benchmark
+#   ./scripts/build.sh clean        # remove containers + volumes
+#   ./scripts/build.sh logs [svc...] # tail service logs (all by default)
+#   ./scripts/build.sh ps           # show container status
+#   ./scripts/build.sh shell [svc]  # shell into a service (default: backend)
+#   ./scripts/build.sh help         # print the command list
+
+set -euo pipefail
+
+# The compose files are referenced by relative path, so run from the parent of
+# this script's directory regardless of where the script was invoked from.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${SCRIPT_DIR}/.."
+
+BOLD="\033[1m"
+GREEN="\033[0;32m"
+CYAN="\033[0;36m"
+YELLOW="\033[1;33m"
+RED="\033[0;31m"
+RESET="\033[0m"
+
+# Log helpers: each prints a colored [quadtrix] tag; error() writes to stderr.
+info()    { echo -e "${CYAN}[quadtrix]${RESET} $*"; }
+success() { echo -e "${GREEN}[quadtrix]${RESET} $*"; }
+warn()    { echo -e "${YELLOW}[quadtrix]${RESET} $*"; }
+error()   { echo -e "${RED}[quadtrix] ERROR:${RESET} $*" >&2; }
+
+# Compose command lines, kept as arrays so every argument stays one word.
+COMPOSE_BASE=(docker compose -f docker-compose.yml)
+COMPOSE_DEV=("${COMPOSE_BASE[@]}" -f docker-compose.dev.yml)
+COMPOSE_GPU=("${COMPOSE_BASE[@]}" -f docker-compose.gpu.yml)
+
+# Exit with an error unless `docker info` succeeds.
+check_docker() {
+  if ! docker info &>/dev/null; then
+    error "Docker daemon is not running. Start Docker Desktop or the Docker service."
+    exit 1
+  fi
+}
+
+# Warn when nvidia-smi is not on PATH; otherwise print the first GPU's name.
+check_nvidia() {
+  if ! command -v nvidia-smi &>/dev/null; then
+    warn "nvidia-smi not found — GPU mode may not work."
+  else
+    info "GPU detected: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
+  fi
+}
+
+# Best-effort `compose pull`; failures are ignored.
+# NOTE(review): no command below calls this yet.
+pull_cache() {
+  info "Pulling build cache images (if available)..."
+  "${COMPOSE_BASE[@]}" pull --ignore-pull-failures 2>/dev/null || true
+}
+
+# up: build and start the base stack in the background.
+cmd_up() {
+  check_docker
+  info "Starting full stack (CPU)..."
+  "${COMPOSE_BASE[@]}" up --build -d
+  success "Stack is up."
+  echo ""
+  echo -e "  ${BOLD}Frontend:${RESET}  http://localhost:5173"
+  echo -e "  ${BOLD}API:${RESET}       http://localhost:3001/api/health"
+  echo -e "  ${BOLD}Docs:${RESET}      http://localhost:3001/docs"
+}
+
+# dev: start in the foreground with the dev override file.
+cmd_dev() {
+  check_docker
+  info "Starting in DEV mode (hot-reload)..."
+  "${COMPOSE_DEV[@]}" up --build
+}
+
+# gpu: start in the background with the GPU override file.
+cmd_gpu() {
+  check_docker
+  check_nvidia
+  info "Starting with CUDA GPU support..."
+  "${COMPOSE_GPU[@]}" up --build -d
+  success "GPU stack is up."
+}
+
+# cpp-only: run the `cpp` service once, forwarding extra arguments to it.
+cmd_cpp_only() {
+  check_docker
+  info "Compiling and running C++ engine..."
+  "${COMPOSE_BASE[@]}" --profile cpp run --rm cpp "$@"
+}
+
+# train-cpp: run the `train-cpp` service from the `train` profile once.
+cmd_train_cpp() {
+  check_docker
+  info "Training with C++ backend..."
+  "${COMPOSE_BASE[@]}" --profile train run --rm train-cpp
+  success "C++ training complete. Checkpoint saved in 'models' volume."
+}
+
+# train-torch: run the `train-torch` service from the `train` profile once.
+cmd_train_torch() {
+  check_docker
+  info "Training with PyTorch backend..."
+  "${COMPOSE_BASE[@]}" --profile train run --rm train-torch
+  success "PyTorch training complete. Checkpoint saved in 'models' volume."
+}
+
+# bench: run the `benchmark` service from the `benchmark` profile once.
+cmd_bench() {
+  check_docker
+  info "Running benchmark..."
+  "${COMPOSE_BASE[@]}" --profile benchmark run --rm benchmark
+}
+
+# logs: follow logs; optional arguments restrict the output to those services.
+cmd_logs() {
+  check_docker
+  "${COMPOSE_BASE[@]}" logs -f --tail=100 "$@"
+}
+
+# clean: after confirmation, remove containers, volumes and labelled images.
+cmd_clean() {
+  check_docker
+  warn "This will remove all containers and volumes (including saved models!)"
+  # `read` fails at EOF (e.g. stdin is not a terminal); treat that as "no"
+  # instead of letting `set -e` end the script without a message.
+  local confirm=""
+  read -r -p "Are you sure? [y/N] " confirm || confirm=""
+  # Glob match rather than ${confirm,,}: that expansion needs bash 4+ and
+  # fails with "bad substitution" on bash 3.2.
+  if [[ "${confirm}" == [yY] || "${confirm}" == [yY][eE][sS] ]]; then
+    "${COMPOSE_BASE[@]}" down -v --remove-orphans
+    docker image prune -f --filter "label=org.opencontainers.image.source=https://github.com/Eamon2009/Quadtrix.cpp"
+    success "Cleaned."
+  else
+    info "Aborted."
+  fi
+}
+
+# ps: list the stack's containers.
+cmd_ps() {
+  check_docker
+  "${COMPOSE_BASE[@]}" ps
+}
+
+# shell: open /bin/sh in a running service (first argument, default backend).
+cmd_shell() {
+  check_docker
+  local service="${1:-backend}"
+  info "Opening shell in '${service}'..."
+  "${COMPOSE_BASE[@]}" exec "${service}" /bin/sh
+}
+
+# Print the command list.
+usage() {
+  echo -e "Usage: ./scripts/build.sh ${BOLD}[command]${RESET}"
+  echo ""
+  echo "Commands:"
+  echo "  up           Full stack (CPU) — default"
+  echo "  dev          Hot-reload dev mode"
+  echo "  gpu          CUDA GPU stack"
+  echo "  cpp-only     Run C++ engine CLI"
+  echo "  train-cpp    Train with C++ backend"
+  echo "  train-torch  Train with PyTorch"
+  echo "  bench        Benchmark"
+  echo "  logs         Tail logs"
+  echo "  ps           Show container status"
+  echo "  shell [svc]  Shell into service (default: backend)"
+  echo "  clean        Remove all containers + volumes"
+  echo "  help         Show this message"
+}
+
+CMD="${1:-up}"
+shift || true
+
+case "${CMD}" in
+  up)             cmd_up "$@" ;;
+  dev)            cmd_dev "$@" ;;
+  gpu)            cmd_gpu "$@" ;;
+  cpp-only)       cmd_cpp_only "$@" ;;
+  train-cpp)      cmd_train_cpp "$@" ;;
+  train-torch)    cmd_train_torch "$@" ;;
+  bench)          cmd_bench "$@" ;;
+  logs)           cmd_logs "$@" ;;
+  clean)          cmd_clean "$@" ;;
+  ps)             cmd_ps "$@" ;;
+  shell)          cmd_shell "$@" ;;
+  help|-h|--help) usage ;;
+  *)
+    # A mistyped command must not look like success to callers such as CI.
+    error "Unknown command: ${CMD}"
+    usage >&2
+    exit 1
+    ;;
+esac