From 4ebd73f067cf52f46b7774238107dd95d0d14224 Mon Sep 17 00:00:00 2001
From: Eamon <eamon112009@gmail.com>
Date: Sun, 31 May 2026 19:26:54 +0530
Subject: [PATCH] exp(#58)

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* feat(ci): optimize workflow pipeline and update docker configurations

* refactor(ci): optimize workflow pipeline and update docker configurations

* refactor : optimize workflow pipeline and update docker configurations

* refactor : optimize workflow pipeline and update docker configurations

* refactor : optimize workflow pipeline and update docker configurations

* Added MIT LICENSE to this project Quadtrix.cpp

* Refactor Dockerfile to use ARG for CUDA version

* Refactor Dockerfile for backend dependencies

* refactor : Dockerfile.backend optimize workflow pipeline

* refactor : Dockerfile.backend optimize workflow pipeline

* refactor : Dockerfile.backend optimize workflow pipeline

* refactor : Dockerfile.backend optimize workflow pipeline

* Delete .devops/Dockerfile.frontend

* Delete .devops/Dockerfile.dev.frontend

* refactor : Dockerfile.backend optimize workflow pipeline

* refactor : Dockerfile.backend optimize workflow pipeline

* refactored (CI): consolidated manual Docker build jobs into a matrix strategy to reduce duplication

* refactored (CI): consolidated manual Docker build jobs into a matrix strategy to reduce duplication

* refactor(ui): rewrite ThinkingIndicator to use inline styles and CSS keyframes

* refactor : message bubble layout to use inline styles

* refactor(ui): complete inline-style migration and update auto-scroll implementation

* refactor(ui): complete inline-style migration for MessageAvatar component

* refactor(ui): rewrite EmptyState component using pure inline styles

* refactored(tensor): vectorize element-wise addition and scalar scaling using AVX/SSE

- Added SIMD vectorization support (`__AVX__` and `__SSE__`) for element-wise `add`, `add_inplace`, and `scale` operations.
- Maintained scalar fallback paths for non-vectorized bounds and platforms lacking hardware extensions.
- Explicitly defined rule-of-five constructors (`default` and `noexcept` moves) within the `Tensor` struct layout.
- Optimized vector initialization across the core construct layer via `std::move` and `std::vector::reserve`.

* refactor(main): redesign training loop to log per-step and sample during evaluation

- Replaced the periodic block evaluation layout with standard, per-step logging metrics (`loss`, `ms`, and `tok/s`).
- Shifted initial validation loss calculation out of the iteration cycle to establish a zero-state baseline.
- Restructured token streaming so that generations are triggered conditionally inside the training loop post-evaluation windows.
- Streamlined architecture parameter reporting and consolidated command-line configuration visual prints.

* feat: implement GPT training loop with multi-GPU and memory optimizations

- Add advanced memory footprint optimization using forward-activation recomputation for LayerNorm and GeLU.
- Optimize layer-wise activation buffer layout using a centralized `TensorSpec` registry to support large batch scaling.
- Integrate cuBLASLt matmul fusions, optional cuDNN attention layers, and stochastic rounding options.
- Fall back gracefully to `cudaMallocManaged` under heavy loads to prevent Outlier/OOM crashes.

* Update README.md with new banner for Quadtrix.cpp

---------

Co-authored-by: Max <eamon5174@gmail.com>
---
 Dockerfile => .devops/Dockerfile              |    2 +-
 Dockerfile.cuda => .devops/Dockerfile.backend |    0
 .devops/Dockerfile.cpp                        |   65 +
 .devops/nginx.conf                            |   47 +
 .dockerignore                                 |   57 +-
 .github/workflows/ci.yml                      |  238 +-
 .github/workflows/docker-publish.yml          |  163 +-
 .github/workflows/pr-check.yml                |  238 ++
 CUDA/main.cu                                  | 2070 +++++++++++++++++
 LICENSE                                       |    2 +-
 Makefile                                      |  104 +
 README.md                                     |    4 +
 config/config.h                               |   20 +-
 docker-compose.dev.yml                        |   45 +
 docker-compose.gpu.yml                        |   32 +
 docker-compose.yml                            |  181 +-
 frontend/src/components/chat/EmptyState.tsx   |   96 +-
 .../src/components/chat/MessageAvatar.tsx     |   45 +-
 frontend/src/components/chat/MessageList.tsx  |   21 +-
 frontend/src/components/chat/MessageRow.tsx   |   87 +-
 .../src/components/chat/ThinkingIndicator.tsx |   28 +-
 include/tensor.h                              |  749 ++++--
 main.cpp                                      |  193 +-
 run.md                                        |  492 ----
 scripts/build.sh                              |  161 ++
 25 files changed, 4077 insertions(+), 1063 deletions(-)
 rename Dockerfile => .devops/Dockerfile (94%)
 rename Dockerfile.cuda => .devops/Dockerfile.backend (100%)
 create mode 100644 .devops/Dockerfile.cpp
 create mode 100644 .devops/nginx.conf
 create mode 100644 .github/workflows/pr-check.yml
 create mode 100644 CUDA/main.cu
 create mode 100644 Makefile
 create mode 100644 docker-compose.dev.yml
 create mode 100644 docker-compose.gpu.yml
 delete mode 100644 run.md
 create mode 100644 scripts/build.sh

diff --git a/Dockerfile b/.devops/Dockerfile
similarity index 94%
rename from Dockerfile
rename to .devops/Dockerfile
index 65fcca9..c7c0061 100644
--- a/Dockerfile
+++ b/.devops/Dockerfile
@@ -35,4 +35,4 @@ COPY . .
 ENV PATH="/app/venv/bin:$PATH"
 ENV PYTHONUNBUFFERED=1
 
-ENTRYPOINT ["python3", "engine/main.py"]   
\ No newline at end of file
+ENTRYPOINT ["python3", "engine/main.py"]   
diff --git a/Dockerfile.cuda b/.devops/Dockerfile.backend
similarity index 100%
rename from Dockerfile.cuda
rename to .devops/Dockerfile.backend
diff --git a/.devops/Dockerfile.cpp b/.devops/Dockerfile.cpp
new file mode 100644
index 0000000..0a1ce15
--- /dev/null
+++ b/.devops/Dockerfile.cpp
@@ -0,0 +1,65 @@
+
+FROM ubuntu:24.04 AS builder
+
+LABEL stage=builder
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG BUILD_TYPE=Release
+ARG CMAKE_EXTRA_FLAGS=""
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    g++ \
+    cmake \
+    ninja-build \
+    ccache \
+    git \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+
+COPY main.cpp        ./
+COPY benchmark.cpp   ./
+COPY config/         ./config/
+COPY include/        ./include/
+COPY data/           ./data/
+
+# Use CMake when a CMake project file is present; otherwise compile main.cpp directly.
+# NOTE: no CMake file is COPY'd above, so the g++ path is the one that runs today.
+# -march=native is omitted on purpose: the image is published and must not depend on the build host's CPU.
+RUN set -e; \
+    if [ -f model/Cmakelists.txt ] || [ -f CMakeLists.txt ]; then \
+    cmake -B build -G Ninja \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+    ${CMAKE_EXTRA_FLAGS} .; \
+    cmake --build build --parallel "$(nproc)"; \
+    else \
+    g++ -std=c++17 -O3 -I. -Iinclude \
+    -o /usr/local/bin/quadtrix main.cpp; \
+    fi
+FROM ubuntu:24.04 AS runtime
+
+LABEL org.opencontainers.image.title="Quadtrix.cpp Engine"
+LABEL org.opencontainers.image.description="C++ transformer engine for local LM inference"
+LABEL org.opencontainers.image.source="https://github.com/Eamon2009/Quadtrix.cpp"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libstdc++6 \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY --from=builder /usr/local/bin/quadtrix /usr/local/bin/quadtrix
+COPY --from=builder /src/data/ ./data/
+VOLUME ["/models"]
+
+ENV GPT_DATA_PATH=/app/data/input.txt \
+    GPT_MODEL_PATH=/models/best_model.bin
+
+EXPOSE 8080
+
+ENTRYPOINT ["/usr/local/bin/quadtrix"]
+CMD ["data/input.txt", "--chat"]
diff --git a/.devops/nginx.conf b/.devops/nginx.conf
new file mode 100644
index 0000000..5804e6e
--- /dev/null
+++ b/.devops/nginx.conf
@@ -0,0 +1,47 @@
+# Quadtrix.cpp — Nginx config
+# Serves the Vite SPA and proxies /api/* to the FastAPI backend
+
+server {
+    listen 80;
+    server_name _;
+
+    root /usr/share/nginx/html;
+    index index.html;
+
+    # Gzip
+    gzip on;
+    gzip_types text/plain text/css application/json application/javascript
+               text/xml application/xml application/xml+rss text/javascript
+               application/wasm;
+    gzip_min_length 1024;
+
+    # SPA fallback — all unknown routes return index.html
+    location / {
+        try_files $uri $uri/ /index.html;
+    }
+
+    # Proxy API calls to FastAPI backend; ^~ keeps the static-asset regex location below from capturing /api/* paths
+    location ^~ /api/ {
+        proxy_pass         http://backend:3001;
+        proxy_http_version 1.1;
+        proxy_set_header   Host              $host;
+        proxy_set_header   X-Real-IP         $remote_addr;
+        proxy_set_header   X-Forwarded-For   $proxy_add_x_forwarded_for;
+        proxy_set_header   X-Forwarded-Proto $scheme;
+        proxy_set_header   Upgrade           $http_upgrade;
+        proxy_set_header   Connection        "upgrade";
+        proxy_read_timeout 120s;
+        proxy_send_timeout 120s;
+    }
+
+    # Static asset cache
+    location ~* \.(js|css|png|svg|ico|woff2|woff|ttf|webmanifest)$ {
+        expires 1y;
+        add_header Cache-Control "public, immutable";
+    }
+
+    # Service worker must not be cached
+    location = /sw.js {
+        add_header Cache-Control "no-cache";
+    }
+}
diff --git a/.dockerignore b/.dockerignore
index f001789..603874e 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,35 +1,44 @@
+# Patterns are anchored at the build-context root; prefix with **/ to match at any depth.
 .git
 .gitignore
 .github
 .venv
-**/__pycache__
-**/*.pyc
-**/*.pyo
-**/*.pyd
-engine/logs/
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+*.egg-info
+.pytest_cache
+.ruff_cache
+dist/
+build/
+*.egg
 node_modules
 frontend/node_modules
-.npm-cache
-frontend/.vite
 frontend/dist
-
-#  Model weights 
-*.pt
-*.bin
-models/
-
-#  Windows build artifacts
-*.exe
+frontend/.vite
+*.npm-cache
+.npmignore
+*.o
+*.a
+*.so
+*.dylib
 quadtrix.exe
-*.png
-*.jpg
-*.jpeg
-*.md
-LICENSE
-contributing.md
-SECURITY.md
-run.md
+quadtrix
+build/
+cmake-build-*/
+.vscode
+*.bin
+*.pt
+*.gguf
+*.safetensors
+engine/best_model.pt
+engine/logs/
+engine/fineweb_30mb.txt
+data/input.txt
 .DS_Store
 Thumbs.db
+*.swp
+*.swo
 .idea
-.vscode
\ No newline at end of file
+docker-compose.override.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 311ad33..bf49286 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,74 +2,216 @@ name: CI
 
 on:
   push:
-    branches:
-      - exp
-      - master
-  pull_request:
-
-permissions:
-  contents: read
+    branches: [master, dev]
+  workflow_dispatch:
+    inputs:
+      image:
+        description: "Which image to build?"
+        required: true
+        type: choice
+        options:
+          - cpp
+          - cpu
+          - cuda
+          - all
+      push:
+        description: "Push to ghcr.io?"
+        required: true
+        default: "true"
+        type: choice
+        options: ["true", "false"]
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix
 
 jobs:
-  cpp-build:
-    name: C++ build
+
+  file-integrity:
+    name: File integrity
+    if: github.event_name == 'push'
     runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
 
+      - name: Check required files exist
+        run: |
+          files=(
+            "main.cpp"
+            "engine/main.py"
+            "requirements.txt"
+          )
+          failed=0
+          for f in "${files[@]}"; do
+            if [ -f "$f" ]; then
+              echo "✅  $f"
+            else
+              echo "❌  $f — MISSING"
+              failed=1
+            fi
+          done
+          exit $failed
+
+
+  lint-python:
+    name: Python lint
+    if: github.event_name == 'push'
+    runs-on: ubuntu-latest
     steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
 
-      - name: Install compiler
-        run: sudo apt-get update && sudo apt-get install -y g++
+      - name: Lint engine/ (ruff)
+        uses: chartboost/ruff-action@v1
+        with:
+          args: "check engine/ --ignore E501 --exit-zero"
 
-      - name: Build Quadtrix
-        run: g++ -std=c++17 -O2 -I. -Iinclude -o quadtrix main.cpp
 
-  backend-smoke:
-    name: Backend smoke checks
+  build-cpp:
+    name: C++ compile check
+    if: github.event_name == 'push'
     runs-on: ubuntu-latest
-
     steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
 
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
+      - name: Install g++
+        run: sudo apt-get update && sudo apt-get install -y g++
 
-      - name: Install backend runtime dependencies
+      - name: Compile main.cpp
         run: |
-          python -m pip install --upgrade pip
-          pip install fastapi "uvicorn[standard]" pydantic pydantic-settings httpx redis
+          g++ -std=c++17 -O3 \
+            -I. -Iinclude \
+            -o quadtrix main.cpp
 
-      - name: Compile Python sources
-        run: python -m compileall backend engine
+      - name: Smoke test
+        run: ./quadtrix --help || true
 
-      - name: Import FastAPI application
-        working-directory: backend
-        run: |
-          python -c "from main import app; print(app.title)"
 
-  frontend-build:
-    name: Frontend build
+  build-cpp-image:
+    name: Build — cpp
+    if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpp' || inputs.image == 'all')
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: docker/setup-qemu-action@v3
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Login to GHCR
+        if: inputs.push == 'true'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.IMAGE_PREFIX }}-cpp
+          tags: |
+            type=ref,event=branch
+            type=sha,prefix=sha-
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build & push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile.cpp
+          platforms: linux/amd64,linux/arm64
+          push: ${{ inputs.push == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha,scope=cpp
+          cache-to: type=gha,mode=max,scope=cpp
+
+
+  build-cpu-image:
+    name: Build — cpu
+    if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpu' || inputs.image == 'all')
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
     steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
+
+      - uses: docker/setup-qemu-action@v3
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Login to GHCR
+        if: inputs.push == 'true'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
         with:
-          node-version: "20"
-          cache: "npm"
-          cache-dependency-path: frontend/package-lock.json
+          images: ${{ env.IMAGE_PREFIX }}-cpu
+          tags: |
+            type=ref,event=branch
+            type=sha,prefix=sha-
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build & push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: ${{ inputs.push == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha,scope=cpu
+          cache-to: type=gha,mode=max,scope=cpu
+
+
+  build-cuda-image:
+    name: Build — cuda
+    if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cuda' || inputs.image == 'all')
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
 
-      - name: Install frontend dependencies
-        working-directory: frontend
-        run: npm ci
+      - uses: docker/setup-buildx-action@v3
 
-      - name: Build frontend
-        working-directory: frontend
-        run: npm run build
+      - name: Login to GHCR
+        if: inputs.push == 'true'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.IMAGE_PREFIX }}-cuda
+          tags: |
+            type=ref,event=branch
+            type=sha,prefix=sha-
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build & push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile.backend
+          platforms: linux/amd64
+          push: ${{ inputs.push == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha,scope=cuda
+          cache-to: type=gha,mode=max,scope=cuda
\ No newline at end of file
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index 1431739..ca9493f 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,73 +1,132 @@
-name: Publish Docker image
+name: Release
+
 on:
-  workflow_dispatch:      
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Version tag (e.g. 1.2.3)"
+        required: true
+
 env:
   REGISTRY: ghcr.io
+  IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix
+
 jobs:
-  build-and-push:
-    name: Build & push (${{ matrix.variant }})
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
+
+  build-binaries:
+    name: Binary (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: false
       matrix:
+        os: [ubuntu-22.04, macos-14]
         include:
-          - variant: cpu
-            dockerfile: Dockerfile
-            tag_suffix: ""
-          - variant: cuda
-            dockerfile: Dockerfile.cuda
-            tag_suffix: "-cuda"
+          - os: ubuntu-22.04
+            artifact_name: quadtrix-linux-x64
+            binary: quadtrix
+          - os: macos-14
+            artifact_name: quadtrix-macos-arm64
+            binary: quadtrix
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Set lowercase image name
-        id: image
+      - uses: actions/checkout@v4
+
+      - name: Compile (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update && sudo apt-get install -y g++
+          # No -march=native: a release binary must not depend on the CI runner's CPU features.
+          g++ -std=c++17 -O3 -I. -Iinclude \
+              -o ${{ matrix.binary }} main.cpp
+          strip ${{ matrix.binary }}
+
+      - name: Compile (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          # No -march=native: keep the release binary independent of the runner's CPU.
+          g++ -std=c++17 -O3 -I. -Iinclude \
+              -o ${{ matrix.binary }} main.cpp
+
+      - name: Package
         run: |
-          echo "name=$(echo '${{ github.repository }}' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Log in to ghcr.io
+          mkdir dist
+          cp ${{ matrix.binary }} dist/
+          cp README.md LICENSE dist/
+          tar -czf ${{ matrix.artifact_name }}.tar.gz -C dist .
+
+      - name: Upload to Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: v${{ github.event.inputs.version }}
+          files: ${{ matrix.artifact_name }}.tar.gz
+          generate_release_notes: true
+
+  publish-images:
+    name: Publish Docker images
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: docker/setup-qemu-action@v3
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Login to GHCR
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Extract Docker metadata
-        id: meta
-        uses: docker/metadata-action@v5
+
+      - name: Parse tag
+        id: tag
+        run: echo "VERSION=${{ github.event.inputs.version }}" >> "$GITHUB_OUTPUT" && echo "PREFIX=${IMAGE_PREFIX,,}" >> "$GITHUB_OUTPUT"
+      # PREFIX is IMAGE_PREFIX lowercased: image references must be lowercase but the repository owner may contain capitals.
+      - name: Build & push backend
+        uses: docker/build-push-action@v6
         with:
-          images: ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}
+          context: .
+          file: .devops/Dockerfile.backend
+          platforms: linux/amd64  # CUDA image (was Dockerfile.cuda); amd64-only, matching ci.yml
+          push: true
           tags: |
-            type=raw,value=latest${{ matrix.tag_suffix }},enable={{is_default_branch}}
-            type=semver,pattern={{version}},suffix=${{ matrix.tag_suffix }}
-            type=semver,pattern={{major}}.{{minor}},suffix=${{ matrix.tag_suffix }}
-            type=ref,event=pr,suffix=${{ matrix.tag_suffix }}
-      - name: Free disk space
-        if: matrix.variant == 'cuda'
-        run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-          df -h
-      - name: Build and push Docker image
+            ${{ steps.tag.outputs.PREFIX }}-backend:latest
+            ${{ steps.tag.outputs.PREFIX }}-backend:${{ steps.tag.outputs.VERSION }}
+          cache-from: type=gha,scope=backend
+          cache-to: type=gha,mode=max,scope=backend
+      # FIXME(review): the commit log lists .devops/Dockerfile.frontend as deleted - confirm it exists or this step fails.
+      - name: Build & push frontend
         uses: docker/build-push-action@v6
         with:
           context: .
-          file: ./${{ matrix.dockerfile }}
+          file: .devops/Dockerfile.frontend
+          platforms: linux/amd64,linux/arm64
           push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=${{ matrix.variant }}
-          cache-to: type=gha,mode=max,scope=${{ matrix.variant }}
-      - name: Image published
+          tags: |
+            ${{ steps.tag.outputs.PREFIX }}-frontend:latest
+            ${{ steps.tag.outputs.PREFIX }}-frontend:${{ steps.tag.outputs.VERSION }}
+          cache-from: type=gha,scope=frontend
+          cache-to: type=gha,mode=max,scope=frontend
+
+      - name: Build & push cpp
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile.cpp
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: |
+            ${{ steps.tag.outputs.PREFIX }}-cpp:latest
+            ${{ steps.tag.outputs.PREFIX }}-cpp:${{ steps.tag.outputs.VERSION }}
+          cache-from: type=gha,scope=cpp
+          cache-to: type=gha,mode=max,scope=cpp
+
+      - name: Create Release summary
         run: |
-          echo "[${{ matrix.variant }}] published:"
-          echo "  docker pull ${{ env.REGISTRY }}/${{ steps.image.outputs.name }}:latest${{ matrix.tag_suffix }}"
+          echo "## Docker images published" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "| Image | Tags |" >> $GITHUB_STEP_SUMMARY
+          echo "|-------|------|" >> $GITHUB_STEP_SUMMARY
+          echo "| \`quadtrix-backend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| \`quadtrix-frontend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| \`quadtrix-cpp\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml
new file mode 100644
index 0000000..c52ae09
--- /dev/null
+++ b/.github/workflows/pr-check.yml
@@ -0,0 +1,238 @@
+name: PR Checks
+
+on:
+  issue_comment:
+    types: [created]
+
+jobs:
+  slash-command:
+    name: Parse /run-checks
+    if: |
+      github.event.issue.pull_request != null &&
+      contains(github.event.comment.body, '/run-checks')
+    runs-on: ubuntu-latest
+    outputs:
+      pr-sha: ${{ steps.get-sha.outputs.sha }}
+    steps:
+      - name: Check commenter permission
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { data } = await github.rest.repos.getCollaboratorPermissionLevel({
+              owner: context.repo.owner,
+              repo:  context.repo.repo,
+              username: context.actor,
+            });
+            if (!['admin', 'write'].includes(data.permission)) {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo:  context.repo.repo,
+                issue_number: context.issue.number,
+                body: `@${context.actor} Only maintainers can trigger checks.`,
+              });
+              core.setFailed('Unauthorized');
+            }
+
+      - name: React with rocket
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner,
+              repo:  context.repo.repo,
+              comment_id: ${{ github.event.comment.id }},
+              content: 'rocket',
+            });
+
+      - name: Get PR head SHA
+        id: get-sha
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo:  context.repo.repo,
+              pull_number: context.issue.number,
+            });
+            core.setOutput('sha', pr.head.sha);
+
+
+  lint:
+    name: Lint
+    needs: slash-command
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ needs.slash-command.outputs.pr-sha }}
+
+      - name: C++ format check
+        run: |
+          sudo apt-get update && sudo apt-get install -y clang-format  # refresh indexes first; stale lists can make install fail
+          find . -name "*.cpp" -o -name "*.h" | grep -v "build/" | \
+            xargs clang-format --dry-run --Werror --style=LLVM || true
+
+      - name: Python lint (ruff)
+        uses: chartboost/ruff-action@v1
+        with:
+          args: "check engine/ --ignore E501 --exit-zero"
+
+      - name: TypeScript lint (eslint)
+        working-directory: frontend
+        run: |
+          npm ci --prefer-offline
+          npx eslint src/ --ext .ts,.tsx --max-warnings 20 || true
+
+
+  build-cpp:
+    name: Build C++ (${{ matrix.os }})
+    needs: slash-command
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04, ubuntu-24.04, macos-14]
+        include:
+          - os: ubuntu-22.04
+            artifact: quadtrix-linux-x64
+          - os: ubuntu-24.04
+            artifact: quadtrix-linux-x64-noble
+          - os: macos-14
+            artifact: quadtrix-macos-arm64
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ needs.slash-command.outputs.pr-sha }}
+
+      - name: Install GCC (Linux)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y g++ ccache
+
+      - name: Cache ccache
+        uses: actions/cache@v4
+        with:
+          path: ~/.ccache
+          key: ccache-${{ matrix.os }}-${{ hashFiles('**/*.cpp', '**/*.h') }}
+          restore-keys: ccache-${{ matrix.os }}-
+
+      - name: Compile main.cpp
+        run: |
+          g++ -std=c++17 -O3 -march=native \
+            -I. -Iinclude \
+            -o quadtrix main.cpp
+
+      - name: Smoke test
+        run: ./quadtrix --help || true
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.artifact }}
+          path: quadtrix
+          retention-days: 7
+
+
+  validate-dockerfiles:
+    name: Validate Dockerfiles
+    needs: slash-command
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ needs.slash-command.outputs.pr-sha }}
+
+    
+      - name: Check required files exist
+        run: |
+          echo "Checking files referenced by Dockerfiles..."
+          files=(
+            "main.cpp"
+            "engine/main.py"
+            "requirements.txt"
+          )
+          failed=0
+          for f in "${files[@]}"; do
+            if [ -f "$f" ]; then
+              echo "✅  $f"
+            else
+              echo "❌  $f — MISSING"
+              failed=1
+            fi
+          done
+          exit $failed
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build check — Dockerfile.cpp (C++ engine)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile.cpp
+          platforms: linux/amd64
+          push: false
+          cache-from: type=gha,scope=cpp
+          cache-to: type=gha,mode=max,scope=cpp
+
+
+      - name: Build check — Dockerfile (PyTorch CPU)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile
+          platforms: linux/amd64
+          push: false
+          cache-from: type=gha,scope=cpu
+          cache-to: type=gha,mode=max,scope=cpu
+
+      - name: Skip CUDA build check
+        run: echo "CUDA build skipped on PR checks — run publish-docker workflow to build cuda image."
+
+
+  test-frontend:
+    name: Frontend Tests
+    needs: [slash-command, lint]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ needs.slash-command.outputs.pr-sha }}
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+          cache: npm
+          cache-dependency-path: frontend/package-lock.json
+
+      - name: Install
+        working-directory: frontend
+        run: npm ci --prefer-offline
+
+      - name: Type-check
+        working-directory: frontend
+        run: npx tsc --noEmit
+
+      - name: Build check
+        working-directory: frontend
+        run: npm run build
+
+
+  post-result:
+    name: Post result
+    needs: [slash-command, lint, build-cpp, validate-dockerfiles, test-frontend]
+    runs-on: ubuntu-latest
+    if: always() && needs.slash-command.result == 'success'  # do not reply when /run-checks was absent or unauthorized
+    steps:
+      - uses: actions/github-script@v7
+        with:
+          script: |
+            const jobs   = ${{ toJSON(needs) }};
+            const failed = Object.values(jobs).some(j => j.result !== 'success');  // cancelled/skipped also count as failed
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo:  context.repo.repo,
+              issue_number: context.issue.number,
+              body: failed
+                ? ' Some checks failed — see Actions for details.'
+                : ' All checks passed!',
+            });
\ No newline at end of file
diff --git a/CUDA/main.cu b/CUDA/main.cu
new file mode 100644
index 0000000..4b24fec
--- /dev/null
+++ b/CUDA/main.cu
@@ -0,0 +1,2070 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string>
+#include <string_view>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "llmcpp/utils.h"
+
+#include "llmcpp/tokenizer.h"
+
+#include "llmcpp/dataloader.h"
+
+#include "llmcpp/rand.h"
+
+#include "llmcpp/schedulers.h"
+
+#include "llmcpp/sampler.h"
+
+#include "llmcpp/logger.h"
+
+#include "llmcpp/mfu.h"
+
+#include "llmcpp/outlier_detector.h"
+
+#include "llmcpp/cuda_common.h"
+
+#include "llmcpp/cuda_utils.cuh"
+
+#include "llmcpp/cublas_common.h"
+
+#include "llmcpp/encoder.cuh"
+
+#include "llmcpp/layernorm.cuh"
+
+#include "llmcpp/matmul.cuh"
+#ifdef ENABLE_CUDNN
+
+#include "llmcpp/cudnn_att.h"
+#else
+
+#include "llmcpp/attention.cuh"
+#endif
+
+#include "llmcpp/fused_classifier.cuh"
+
+#include "llmcpp/adamw.cuh"
+
+#include "llmcpp/global_norm.cuh"
+
+#include "llmcpp/zero.cuh"
+
+char filename_buffer[512];
+// CUDA device properties, and the stream handed to the kernel wrappers and I/O helpers in this file.
+cudaDeviceProp deviceProp;
+cudaStream_t main_stream;
+// Buffer size (32 MiB) passed to device_to_file / file_to_device for checkpoint I/O.
+constexpr const size_t IO_BUF_SIZE = 32 * 1024 * 1024;
+
+typedef struct
+{ // Model hyperparameters; read from a checkpoint header or set from a descriptor string.
+      int max_seq_len;       // maxT: sizes the wpe tensor (maxT * C elements)
+      int vocab_size;        // V: passed to tokenCheck and fused_classifier
+      int padded_vocab_size; // Vp: sizes wte (Vp * C elements) and the final matmul output width
+      int num_layers;        // L
+      int num_heads;         // NH
+      int channels;          // C
+} GPT2Config;
+
+constexpr const int NUM_PARAMETER_TENSORS = 16;
+typedef struct
+{ // One pointer per parameter tensor; the element counts noted below are those set in fill_in_parameter_sizes.
+      floatX *wte;      // Vp * C
+      floatX *wpe;      // maxT * C
+      floatX *ln1w;     // L * C
+      floatX *ln1b;     // L * C
+      floatX *qkvw;     // L * (3 * C) * C
+      floatX *qkvb;     // L * (3 * C)
+      floatX *attprojw; // L * C * C
+      floatX *attprojb; // L * C
+      floatX *ln2w;     // L * C
+      floatX *ln2b;     // L * C
+      floatX *fcw;      // L * (4 * C) * C
+      floatX *fcb;      // L * (4 * C)
+      floatX *fcprojw;  // L * C * (4 * C)
+      floatX *fcprojb;  // L * C
+      floatX *lnfw;     // C
+      floatX *lnfb;     // C
+} ParameterTensors;
+static_assert(sizeof(ParameterTensors) == NUM_PARAMETER_TENSORS * sizeof(void *), "Inconsistent sizes!");
+
+void fill_in_parameter_sizes(size_t *param_sizes, size_t *param_sizeof, GPT2Config config)
+{
+      // Element count of each parameter tensor, listed in ParameterTensors field order.
+      const size_t Vp = config.padded_vocab_size, C = config.channels;
+      const size_t maxT = config.max_seq_len, L = config.num_layers;
+      const size_t counts[NUM_PARAMETER_TENSORS] = {
+          Vp * C,          // wte
+          maxT * C,        // wpe
+          L * C,           // ln1w
+          L * C,           // ln1b
+          L * (3 * C) * C, // qkvw
+          L * (3 * C),     // qkvb
+          L * C * C,       // attprojw
+          L * C,           // attprojb
+          L * C,           // ln2w
+          L * C,           // ln2b
+          L * (4 * C) * C, // fcw
+          L * (4 * C),     // fcb
+          L * C * (4 * C), // fcprojw
+          L * C,           // fcprojb
+          C,               // lnfw
+          C};              // lnfb
+      for (int t = 0; t < NUM_PARAMETER_TENSORS; ++t)
+      {
+            param_sizes[t] = counts[t];
+            param_sizeof[t] = sizeof(floatX); // every parameter tensor is stored as floatX
+      }
+}
+
+void *malloc_and_point_parameters(ParameterTensors *params, size_t *param_elements, size_t *param_sizeof)
+{
+      // Allocates one device buffer for all parameter tensors and points each field of *params into it.
+      size_t total_bytes = 0;
+      for (int t = 0; t < NUM_PARAMETER_TENSORS; ++t)
+      {
+            total_bytes += param_elements[t] * param_sizeof[t];
+      }
+
+      void *base = nullptr;
+      cudaCheck(cudaMalloc(&base, total_bytes));
+      // Field addresses in declaration order; each one receives the next slice of the buffer.
+      floatX **fields[NUM_PARAMETER_TENSORS] = {
+          &params->wte, &params->wpe, &params->ln1w, &params->ln1b, &params->qkvw, &params->qkvb,
+          &params->attprojw, &params->attprojb, &params->ln2w, &params->ln2b, &params->fcw, &params->fcb,
+          &params->fcprojw, &params->fcprojb, &params->lnfw, &params->lnfb};
+      size_t byte_offset = 0;
+      for (int t = 0; t < NUM_PARAMETER_TENSORS; ++t)
+      {
+            *fields[t] = (floatX *)((char *)base + byte_offset);
+            byte_offset += param_elements[t] * param_sizeof[t];
+      }
+      return base; // the caller keeps this pointer as the owner of the whole allocation
+}
+
+constexpr int NUM_ACTIVATION_TENSORS = 21;
+typedef struct
+{ // One pointer per activation tensor; the element counts noted below are those set in fill_in_activation_sizes.
+      floatX *encoded; // B * T * C
+      floatX *ln1;     // L * B * T * C, or no storage (NULL) when recompute >= 2
+      float *ln1_mean; // L * B * T
+      float *ln1_rstd; // L * B * T
+      floatX *atty;    // L * B * T * C
+      // Uses #ifdef, like every site that sizes or indexes att, so the element type always matches that code.
+#ifdef ENABLE_CUDNN
+      float *att;      // L * B * NH * T
+#else
+      floatX *att;     // L * B * NH * T * T
+#endif
+
+      floatX *residual2; // L * B * T * C
+      floatX *ln2;       // L * B * T * C, or no storage (NULL) when recompute >= 2
+      float *ln2_mean;   // L * B * T
+      float *ln2_rstd;   // L * B * T
+      floatX *fch;       // L * B * T * 4 * C
+      floatX *fch_gelu;  // L * B * T * 4 * C when recompute < 1, otherwise B * T * 4 * C
+      floatX *residual3; // L * B * T * C
+      floatX *lnf;       // B * T * C
+      float *lnf_mean;   // B * T
+      float *lnf_rstd;   // B * T
+      float *losses;     // B * T
+
+      floatX *qkvr;      // L * B * T * 3 * C
+
+      floatX *output;    // B * T * max(3 * C, NH * T, Vp); also reused as scratch by forward/backward
+      // Scratch buffers; the backward pass uses them as gradient workspaces.
+      floatX *scratch_bt4c; // B * T * 4 * C
+      floatX *scratch_btc;  // B * T * C
+} ActivationTensors;
+
+struct TensorSpec
+{ // Describes one activation tensor: where to store its pointer, how many elements it has, and its dtype.
+      void **ptr;  // address of the pointer field to fill in
+      size_t size; // element count; 0 means no storage is assigned (the field is set to NULL)
+      DType type;  // element type, as reported by dtype_of
+};
+// Builds a TensorSpec for `pointer`. NOTE(review): the expansion already ends in ';'.
+#define TENSOR_SPEC(pointer, size) TensorSpec{(void **)(&pointer), (size), dtype_of(pointer)};
+
+void fill_in_activation_sizes(const ActivationTensors *data, TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS], size_t B, size_t T, GPT2Config config, int recompute)
+{ // Fills tensors[] with (field address, element count, dtype) for every activation tensor, in field order.
+      size_t Vp = config.padded_vocab_size;
+      size_t L = config.num_layers;
+      size_t NH = config.num_heads;
+      size_t C = config.channels;
+      tensors[0] = TENSOR_SPEC(data->encoded, B * T * C);
+      // ln1 gets no storage when recompute >= 2 (a size of 0 becomes a NULL pointer at allocation time).
+      tensors[1] = TENSOR_SPEC(data->ln1, (recompute < 2) ? L * B * T * C : 0);
+      tensors[2] = TENSOR_SPEC(data->ln1_mean, L * B * T);
+      tensors[3] = TENSOR_SPEC(data->ln1_rstd, L * B * T);
+      tensors[4] = TENSOR_SPEC(data->atty, L * B * T * C);
+#ifdef ENABLE_CUDNN
+      // cuDNN build: att holds L * B * NH * T elements instead of L * B * NH * T * T.
+      tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T);
+#else
+      tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T * T);
+#endif
+      tensors[6] = TENSOR_SPEC(data->residual2, L * B * T * C);
+      // ln2 follows the same rule as ln1: no storage when recompute >= 2.
+      tensors[7] = TENSOR_SPEC(data->ln2, (recompute < 2) ? L * B * T * C : 0);
+      tensors[8] = TENSOR_SPEC(data->ln2_mean, L * B * T);
+      tensors[9] = TENSOR_SPEC(data->ln2_rstd, L * B * T);
+      tensors[10] = TENSOR_SPEC(data->fch, L * B * T * 4 * C);
+      // With recompute >= 1 only one layer's worth of fch_gelu is kept instead of L.
+      tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? L * B * T * 4 * C : B * T * 4 * C);
+      tensors[12] = TENSOR_SPEC(data->residual3, L * B * T * C);
+      tensors[13] = TENSOR_SPEC(data->lnf, B * T * C);
+      tensors[14] = TENSOR_SPEC(data->lnf_mean, B * T);
+      tensors[15] = TENSOR_SPEC(data->lnf_rstd, B * T);
+      tensors[16] = TENSOR_SPEC(data->losses, B * T);
+      tensors[17] = TENSOR_SPEC(data->qkvr, L * B * T * 3 * C);
+      tensors[18] = TENSOR_SPEC(data->output, B * T * max(3 * C, max(NH * T, Vp)));
+      // Scratch buffers, one of B * T * 4 * C elements and one of B * T * C elements.
+      tensors[19] = TENSOR_SPEC(data->scratch_bt4c, B * T * 4 * C);
+      tensors[20] = TENSOR_SPEC(data->scratch_btc, B * T * C);
+}
+
+void *malloc_and_point_activations(TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS])
+{
+      // Add up the byte size of every activation tensor.
+      size_t total_bytes = 0;
+      for (const TensorSpec &spec : tensors)
+      {
+            total_bytes += spec.size * sizeof_dtype(spec.type);
+      }
+
+      printf0("allocating %d MiB for activations\n", (int)round(total_bytes / (1024 * 1024)));
+
+      // A single zero-filled device allocation backs all of the tensors.
+      void *base = nullptr;
+      cudaCheck(cudaMalloc(&base, total_bytes));
+      cudaCheck(cudaMemset(base, 0, total_bytes));
+
+      // Hand out consecutive slices; a zero-sized tensor gets NULL and consumes nothing.
+      char *cursor = (char *)base;
+      for (TensorSpec &spec : tensors)
+      {
+            if (spec.size == 0)
+            {
+                  *(spec.ptr) = NULL;
+                  continue;
+            }
+            *(spec.ptr) = cursor;
+            cursor += spec.size * sizeof_dtype(spec.type);
+      }
+
+      return base;
+}
+
+typedef struct
+{
+      GPT2Config config;
+      // Parameters: pointers into params_memory, plus per-tensor element counts and element sizes.
+      ParameterTensors params;
+      size_t param_elements[NUM_PARAMETER_TENSORS];
+      size_t param_sizeof[NUM_PARAMETER_TENSORS];
+      void *params_memory;
+      size_t num_parameters;
+      size_t num_parameters_bytes;
+      // Gradients: carved up the same way as params, backed by grads_memory.
+      ParameterTensors grads;
+      void *grads_memory;
+      // AdamW state (m, v) and, when use_master_weights == 1, a float master copy of the parameters.
+      float *m_memory;
+      float *v_memory;
+      float *master_weights;
+      // Activations: pointers into acts_memory, described by acts_specs.
+      ActivationTensors acts;
+      TensorSpec acts_specs[NUM_ACTIVATION_TENSORS];
+      void *acts_memory;
+      // Run state: current batch shape, token buffers, loss bookkeeping and option flags.
+      int batch_size;
+      int seq_len;
+      int *inputs;
+      int *targets;
+      float mean_loss;
+      float *accumulated_mean_loss;
+      float *cpu_losses;
+      unsigned long long rng_state;
+      unsigned long long rng_state_last_update;
+      int use_master_weights;
+      bool init_state;
+      int gelu_fusion;
+      int recompute;
+      // Work buffers allocated with mallocCheck in gpt2_allocate_state.
+      int *workload_indices;
+      int4 *bucket_info;
+} GPT2;
+
+void gpt2_init_common(GPT2 *model)
+{
+      // Nothing is allocated yet: every buffer pointer held by the model starts out null.
+      model->params_memory = nullptr;
+      model->grads_memory = nullptr;
+      model->acts_memory = nullptr;
+      model->m_memory = nullptr;
+      model->v_memory = nullptr;
+      model->master_weights = nullptr;
+      model->inputs = nullptr;
+      model->targets = nullptr;
+      model->accumulated_mean_loss = nullptr;
+      model->cpu_losses = nullptr;
+      model->workload_indices = nullptr;
+      model->bucket_info = nullptr;
+
+      // No batch shape is known and no loss has been computed yet.
+      model->batch_size = 0;
+      model->seq_len = 0;
+      model->mean_loss = -1.0f;
+
+      // Default options; the RNG seed is offset by this process's rank.
+      model->rng_state = 13371337 + multi_gpu_config.process_rank;
+      model->use_master_weights = 1;
+      model->init_state = true;
+      model->gelu_fusion = 0;
+      model->recompute = 1;
+}
+
+void gpt2_allocate_weights(GPT2 *model)
+{
+      // Sizes the parameter tensors from model->config, records the totals, then allocates them on the device.
+      fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config);
+      size_t total_elements = 0, total_bytes = 0;
+      for (int t = 0; t < NUM_PARAMETER_TENSORS; ++t)
+      {
+            total_elements += model->param_elements[t];
+            total_bytes += model->param_elements[t] * model->param_sizeof[t];
+      }
+      model->num_parameters = total_elements;
+      model->num_parameters_bytes = total_bytes;
+      assert(model->params_memory == nullptr); // weights must not have been allocated already
+      model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof);
+}
+
+void gpt2_allocate_state(GPT2 *model, int B, int T)
+{
+      // Allocates what training with batch size B and sequence length T needs: gradients, activations, token buffers, AdamW state.
+      printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024)));
+      assert(model->grads_memory == nullptr);
+      model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof);
+
+      model->batch_size = B;
+      model->seq_len = T;
+      const size_t num_tokens = (size_t)B * (size_t)T; // widened first: B * T evaluated in int could overflow
+      fill_in_activation_sizes(&model->acts, model->acts_specs, B, T, model->config, model->recompute);
+      model->acts_memory = malloc_and_point_activations(model->acts_specs);
+      // Device token buffers, a device-side loss accumulator, and a cudaMallocHost buffer for per-token losses.
+      cudaCheck(cudaMalloc((void **)&model->inputs, num_tokens * sizeof(int)));
+      cudaCheck(cudaMalloc((void **)&model->targets, num_tokens * sizeof(int)));
+      cudaCheck(cudaMalloc(((void **)&model->accumulated_mean_loss), sizeof(float)));
+      cudaCheck(cudaMallocHost((void **)&model->cpu_losses, num_tokens * sizeof(float)));
+      // Work buffers with one entry per (token, channel group); the assert keeps that count below 2^31.
+      size_t num_c_groups = CEIL_DIV(model->config.channels, (WARP_SIZE * x128::size));
+      assert(num_tokens * num_c_groups < (1ULL << 31ULL));
+      model->workload_indices = (int *)mallocCheck(sizeof(int) * num_tokens * num_c_groups);
+      model->bucket_info = (int4 *)mallocCheck(sizeof(int4) * num_tokens * num_c_groups);
+      // AdamW state; memory_status collects the return values of the conditionally-managed allocations.
+      int memory_status = 0;
+      size_t shard_num_parameters = multi_gpu_config.shard_num_parameters;
+      printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20);
+      printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20);
+      assert(model->m_memory == nullptr);
+      assert(model->v_memory == nullptr);
+      memory_status |= cudaMallocConditionallyManaged((void **)&model->m_memory, shard_num_parameters * sizeof(float));
+      memory_status |= cudaMallocConditionallyManaged((void **)&model->v_memory, shard_num_parameters * sizeof(float));
+
+      if (model->use_master_weights == 1)
+      {
+            assert(model->master_weights == nullptr);
+            printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20);
+            memory_status |= cudaMallocConditionallyManaged((void **)&model->master_weights, shard_num_parameters * sizeof(float));
+      }
+      // Presumably sums the status across processes, so the warning can report how many GPUs fell back.
+      int reduced_memory_status = (int)multi_gpu_cpu_float_sum((float)memory_status, &multi_gpu_config);
+      if (reduced_memory_status >= 1)
+      {
+            printf0("WARNING: Fell back to cudaMallocManaged when initializing m,v,master_weights on %d GPUs\n", reduced_memory_status);
+            printf0("         Prevents an OOM, but code may run much slower due to device <-> host memory movement\n");
+      }
+      // Report device memory in use; the values are size_t, hence %zu.
+      size_t free, total;
+      cudaCheck(cudaMemGetInfo(&free, &total));
+      printf0("device memory usage: %zu MiB / %zu MiB\n", (total - free) / 1024 / 1024, total / 1024 / 1024);
+      // Estimate how many more sequences would fit, from the activation bytes one sequence needs.
+      size_t bytes_per_sequence = 0;
+      for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++)
+      {
+            bytes_per_sequence += model->acts_specs[i].size * sizeof_dtype(model->acts_specs[i].type) / B;
+      }
+      printf0("memory per sequence: %zu MiB\n", bytes_per_sequence / 1024 / 1024);
+      printf0(" -> estimated maximum batch size: %zu\n", B + free / bytes_per_sequence);
+}
+
+void gpt2_write_to_checkpoint(GPT2 *model, const char *checkpoint_path)
+{
+      // Writes a 256-int header followed by the raw parameter bytes copied from the device.
+      printf0("Writing model to %s\n", checkpoint_path);
+      FILE *out = fopenCheck(checkpoint_path, "wb");
+
+      assert(PRECISION_MODE == PRECISION_FP32 || PRECISION_MODE == PRECISION_BF16);
+      const GPT2Config &cfg = model->config;
+      int header[256] = {0};
+      header[0] = 20240326;                                   // magic number checked on load
+      header[1] = (PRECISION_MODE == PRECISION_FP32) ? 3 : 5; // version: 3 = fp32, 5 = bf16
+      header[2] = cfg.max_seq_len;
+      header[3] = cfg.vocab_size;
+      header[4] = cfg.num_layers;
+      header[5] = cfg.num_heads;
+      header[6] = cfg.channels;
+      header[7] = cfg.padded_vocab_size;
+      fwriteCheck(header, sizeof(int), 256, out);
+
+      device_to_file(out, model->params_memory, model->num_parameters_bytes,
+                     IO_BUF_SIZE, main_stream);
+
+      fcloseCheck(out);
+}
+
+void gpt2_build_from_checkpoint(GPT2 *model, const char *checkpoint_path, bool weight_init = true)
+{
+      // Reads the config from a checkpoint header, allocates the weights and, if weight_init, loads them from the file.
+      if (PRECISION_MODE == PRECISION_FP16)
+      {
+            // Checkpoint versions only distinguish fp32 (3) and bf16 (5).
+            fprintf(stderr, "build_from_checkpoint() does not support fp16 right now.\n");
+            exit(EXIT_FAILURE);
+      }
+      // Header: 256 ints; [0] magic, [1] version, [2..7] model configuration.
+      FILE *model_file = fopenCheck(checkpoint_path, "rb");
+      int model_header[256];
+      freadCheck(model_header, sizeof(int), 256, model_file);
+      if (model_header[0] != 20240326)
+      {
+            fprintf(stderr, "Bad magic model file\n"); // stderr, like every other fatal error in this function
+            exit(EXIT_FAILURE);
+      }
+      int version = model_header[1];
+      if (!(version == 3 || version == 5))
+      {
+
+            fprintf(stderr, "Bad version in model file\n");
+            fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n");
+            exit(EXIT_FAILURE);
+      }
+      // The file's precision only has to match the build when the weights are actually going to be loaded.
+      if (weight_init)
+      {
+            if (PRECISION_MODE == PRECISION_BF16 && version != 5)
+            {
+                  fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path);
+                  fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n");
+                  exit(EXIT_FAILURE);
+            }
+            if (PRECISION_MODE == PRECISION_FP32 && version != 3)
+            {
+                  fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path);
+                  fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_gpt2cu PRECISION=FP32`\n");
+                  fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n");
+                  exit(EXIT_FAILURE);
+            }
+      }
+      // Same slot order that gpt2_write_to_checkpoint uses.
+      model->config.max_seq_len = model_header[2];
+      model->config.vocab_size = model_header[3];
+      model->config.num_layers = model_header[4];
+      model->config.num_heads = model_header[5];
+      model->config.channels = model_header[6];
+      model->config.padded_vocab_size = model_header[7];
+
+      gpt2_allocate_weights(model);
+      // The parameter bytes follow the header directly in the file.
+      if (weight_init)
+      {
+            assert(model->params_memory != NULL);
+            file_to_device(model->params_memory, model_file, model->num_parameters_bytes, IO_BUF_SIZE, main_stream);
+      }
+      fcloseCheck(model_file);
+
+      cudaCheck(cudaDeviceSynchronize());
+}
+
+void gpt2_set_hyperparameters(GPT2Config *config, const char *depth_str)
+{
+      // Maps a layer count given as text to one of the fixed (channels, heads) presets.
+      const int depth = atoi(depth_str);
+      // atoi yields 0 when the text is not a number, which the assert rejects.
+      assert(depth > 0);
+      int channels = 0;
+      int num_heads = 0;
+
+      switch (depth)
+      {
+      case 6:
+            channels = 384;
+            num_heads = 6;
+            break;
+      case 12:
+            channels = 768;
+            num_heads = 12;
+            break;
+      case 24:
+            channels = 1024;
+            num_heads = 16;
+            break;
+      case 36:
+            channels = 1280;
+            num_heads = 20;
+            break;
+      case 48:
+            channels = 1600;
+            num_heads = 25;
+            break;
+      case 60:
+            channels = 1920;
+            num_heads = 30;
+            break;
+      case 72:
+            channels = 2880;
+            num_heads = 30;
+            break;
+      case 84:
+            channels = 3456;
+            num_heads = 36;
+            break;
+      // Any other depth is reported and terminates the program.
+      default:
+            fprintf(stderr, "Unsupported GPT-2 depth: %d\n", depth);
+            exit(EXIT_FAILURE);
+      }
+
+      // All presets share a maximum sequence length of 1024.
+      config->num_layers = depth;
+      config->channels = channels;
+      config->num_heads = num_heads;
+      config->max_seq_len = 1024;
+}
+
+void gpt3_set_hyperparameters(GPT2Config *config, const char *channels_str)
+{ // Maps a channel count given as text to one of the fixed GPT-3 style (depth, head size) presets.
+      int channels = atoi(channels_str);
+      assert(channels > 0);
+      int depth, head_size;
+      if (channels == 384)
+      {
+            depth = 6;
+            head_size = 64;
+      }
+      else if (channels == 768)
+      {
+            depth = 12;
+            head_size = 64;
+      }
+      else if (channels == 1024)
+      {
+            depth = 24;
+            head_size = 64;
+      }
+      else if (channels == 1536)
+      {
+            depth = 24;
+            head_size = 96;
+      }
+      else if (channels == 2048)
+      {
+            depth = 24;
+            head_size = 128;
+      }
+      else if (channels == 2560)
+      {
+            depth = 32;
+            head_size = 80;
+      }
+      else if (channels == 4096)
+      {
+            depth = 32;
+            head_size = 128;
+      }
+      else if (channels == 5120 || channels == 5140)
+      { // 5140 % 128 != 0, so it could never pass the divisibility assert below; it is mapped to 5120.
+            channels = 5120;
+            depth = 40;
+            head_size = 128;
+      }
+      else if (channels == 12288)
+      {
+            depth = 96;
+            head_size = 128;
+      }
+      else
+      {
+            fprintf(stderr, "Unsupported GPT-3 channels: %d\n", channels);
+            exit(EXIT_FAILURE);
+      }
+      assert(channels % head_size == 0);
+      config->num_layers = depth;
+      config->channels = channels;
+      config->num_heads = channels / head_size;
+      config->max_seq_len = 2048;
+}
+
+void gpt_build_from_descriptor(GPT2 *model, const char *descriptor)
+{
+      // Builds a randomly initialised model from "dN" / "gpt2:dN" (GPT-2 depth N) or "gpt3:cN" (GPT-3 channels N).
+      assert(descriptor != NULL);
+      size_t len = strlen(descriptor);
+      if (len > 1 && descriptor[0] == 'd')
+      {
+            gpt2_set_hyperparameters(&model->config, descriptor + 1);
+      }
+      else if (len > 6 && strncmp(descriptor, "gpt2:d", 6) == 0)
+      {
+            gpt2_set_hyperparameters(&model->config, descriptor + 6);
+      }
+      else if (len > 6 && strncmp(descriptor, "gpt3:c", 6) == 0)
+      {
+            gpt3_set_hyperparameters(&model->config, descriptor + 6);
+      }
+      else
+      {
+            fprintf(stderr, "Unsupported model descriptor: %s\n", descriptor);
+            exit(EXIT_FAILURE);
+      }
+      // The vocabulary sizes are fixed here rather than derived from the descriptor.
+      model->config.vocab_size = 50257;
+      model->config.padded_vocab_size = 50304;
+
+      gpt2_allocate_weights(model);
+      // Weights are generated on the host with a fixed seed (42), then copied to the device in one go.
+      mt19937_state init_rng;
+      manual_seed(&init_rng, 42);
+      floatX *params_memory_cpu = (floatX *)mallocCheck(model->num_parameters_bytes);
+      memset(params_memory_cpu, 0, model->num_parameters_bytes);
+
+      float residual_scale = 1.0f / sqrtf(2.0f * model->config.num_layers);
+      // offset is an element index into params_memory_cpu; it restarts at 0 for every layer pass.
+      size_t L = model->config.num_layers;
+      size_t offset = 0;
+      for (int l = 0; l < L; l++)
+      {
+            offset = 0;
+            for (int i = 0; i < NUM_PARAMETER_TENSORS; i++)
+            {
+                  // Tensors 2, 8 and 14 (ln1w, ln2w, lnfw) are filled with ones, on the first pass only.
+                  if (l == 0 && (i == 2 || i == 8 || i == 14))
+                  {
+                        for (size_t j = 0; j < model->param_elements[i]; j++)
+                        {
+                              params_memory_cpu[offset + j] = 1.0f;
+                        }
+                  }
+                  // wte and wpe (first pass only) and tensors 4, 6, 10, 12 (every pass) get normal_() samples.
+                  if ((l == 0 && (i == 0 || i == 1)) || i == 4 || i == 6 || i == 10 || i == 12)
+                  {
+                        size_t n = model->param_elements[i];
+                        size_t layer_offset = 0;
+                        if (i == 0)
+                        {
+                              // Only the first vocab_size * channels elements of wte are sampled; the rest stay zero.
+                              n = model->config.vocab_size * model->config.channels;
+                        }
+                        if (i == 4 || i == 6 || i == 10 || i == 12)
+                        {
+                              // These tensors hold L layers back to back; this pass fills only layer l's slice.
+                              assert(n % L == 0);
+                              n = n / L;
+                              layer_offset = l * n;
+                        }
+                        // Tensors 6 and 12 (attprojw, fcprojw) use 0.02 scaled down by 1 / sqrt(2 * num_layers).
+                        float scale = (i == 6 || i == 12) ? 0.02f * residual_scale : 0.02f;
+                        // Sample into a float buffer, then convert element by element to floatX.
+                        float *fp32_buffer = (float *)mallocCheck(n * sizeof(float));
+                        normal_(fp32_buffer, n, 0.0f, scale, &init_rng);
+                        for (size_t j = 0; j < n; j++)
+                        {
+                              params_memory_cpu[offset + layer_offset + j] = (floatX)fp32_buffer[j];
+                        }
+                        free(fp32_buffer);
+                  }
+                  offset += model->param_elements[i];
+            }
+      }
+      // Upload the finished host copy to the device buffer and release it.
+      cudaCheck(cudaMemcpy(model->params_memory, params_memory_cpu, model->num_parameters_bytes, cudaMemcpyHostToDevice));
+      free(params_memory_cpu);
+}
+
+void gpt2_forward(GPT2 *model, const int *inputs, size_t B, size_t T)
+{
+      NVTX_RANGE_FN();
+      // Runs the forward pass for B * T token ids; the final matmul result is left in model->acts.output.
+      if (model->params_memory == NULL)
+      {
+            printf("Error: model was not initialized properly.\n");
+            exit(EXIT_FAILURE);
+      }
+
+      const size_t V = model->config.vocab_size;
+      const size_t Vp = model->config.padded_vocab_size;
+      const size_t L = model->config.num_layers;
+      const size_t NH = model->config.num_heads;
+      const size_t C = model->config.channels;
+      // The requested batch must fit inside the batch shape recorded by gpt2_allocate_state.
+      if (B > model->batch_size || T > model->seq_len)
+      {
+            printf("Model: B=%d T=%d, Desired: B=%d T=%d\n", model->batch_size, model->seq_len, (int)B, (int)T);
+            exit(EXIT_FAILURE);
+      }
+      // Copy the token ids to the device; tokenCheck presumably range-checks the host copy against V.
+      cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice));
+
+      tokenCheck(inputs, B * T, V);
+
+      ParameterTensors params = model->params;
+      ActivationTensors acts = model->acts;
+      encoder_forward(acts.encoded, model->inputs, params.wte, params.wpe, B, T, C, main_stream);
+      // Layer 0's first layernorm runs here; later layers get theirs from the fused call at the end of the loop.
+      layernorm_forward((model->recompute < 2) ? acts.ln1 : acts.lnf, acts.ln1_mean, acts.ln1_rstd, acts.encoded, params.ln1w, params.ln1b, B, T, C, main_stream);
+
+      for (int l = 0; l < L; l++)
+      {
+            NvtxRange layer_range("Layer", l);
+            // Input to this layer: the encoder output for layer 0, otherwise the previous layer's residual3.
+            floatX *residual = l == 0 ? acts.encoded : acts.residual3 + (l - 1) * B * T * C;
+            // Per-layer slices of the parameter tensors.
+            floatX *l_qkvw = params.qkvw + l * 3 * C * C;
+            floatX *l_qkvb = params.qkvb + l * 3 * C;
+            floatX *l_attprojw = params.attprojw + l * C * C;
+            floatX *l_attprojb = params.attprojb + l * C;
+            floatX *l_ln2w = params.ln2w + l * C;
+            floatX *l_ln2b = params.ln2b + l * C;
+            floatX *l_fcw = params.fcw + l * 4 * C * C;
+            floatX *l_fcb = params.fcb + l * 4 * C;
+            floatX *l_fcprojw = params.fcprojw + l * C * 4 * C;
+            floatX *l_fcprojb = params.fcprojb + l * C;
+            // Per-layer slices of the activation tensors; with recompute >= 2, ln1 and ln2 both use the acts.lnf buffer.
+            floatX *l_ln1 = (model->recompute < 2) ? acts.ln1 + l * B * T * C : acts.lnf;
+            floatX *l_qkvr = acts.qkvr + l * B * T * 3 * C;
+            floatX *l_atty = acts.atty + l * B * T * C;
+            floatX *l_residual2 = acts.residual2 + l * B * T * C;
+            floatX *l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf;
+            float *l_ln2_mean = acts.ln2_mean + l * B * T;
+            float *l_ln2_rstd = acts.ln2_rstd + l * B * T;
+            floatX *l_fch = acts.fch + l * B * T * 4 * C;
+            // With recompute >= 1 all layers share one fch_gelu buffer instead of having one slice each.
+            floatX *l_fch_gelu = (model->recompute < 1) ? acts.fch_gelu + l * B * T * 4 * C : acts.fch_gelu;
+            floatX *l_residual3 = acts.residual3 + l * B * T * C;
+            floatX *scratch = (floatX *)acts.output;
+            // acts.output doubles as scratch space inside the loop; it receives the final result only after it.
+#ifdef ENABLE_CUDNN
+            float *l_att = (float *)acts.att + l * B * NH * T;
+            matmul_forward_cublaslt(l_qkvr, l_ln1, l_qkvw, l_qkvb, B, T, C, 3 * C, main_stream);
+            attention_forward_cudnn(l_atty, (float *)l_att, l_qkvr, B, T, NH, C, main_stream);
+#else
+            floatX *l_att = acts.att + l * B * NH * T * T;
+            if (T != model->seq_len)
+            {
+                  cudaCheck(cudaMemset(l_att, 0, B * NH * T * T * sizeof(floatX)));
+            }
+            // Non-cuDNN path: the QKV matmul writes to scratch, which is then passed to attention_forward.
+            matmul_forward_cublaslt(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, 3 * C, main_stream);
+            attention_forward(l_atty, l_qkvr, l_att, scratch, B, T, C, NH, main_stream);
+#endif
+            // Attention projection, fused residual add + ln2, then the two MLP matmuls (fc, fcproj).
+            matmul_forward_cublaslt(scratch, l_atty, l_attprojw, l_attprojb, B, T, C, C, main_stream);
+            fused_residual_forward5(l_residual2, l_ln2, l_ln2_mean, l_ln2_rstd, residual, scratch, l_ln2w, l_ln2b, B * T, C, main_stream);
+            matmul_forward_cublaslt(l_fch_gelu, l_ln2, l_fcw, l_fcb, B, T, C, 4 * C, main_stream, l_fch, model->gelu_fusion);
+            matmul_forward_cublaslt(scratch, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4 * C, C, main_stream);
+            // The last residual add is fused with the next layer's ln1, or with the final layernorm on the last layer.
+            if (l + 1 != L)
+            {
+                  floatX *l_ln1 = (model->recompute < 2) ? acts.ln1 + (l + 1) * B * T * C : acts.lnf;
+                  float *l_ln1_mean = acts.ln1_mean + (l + 1) * B * T;
+                  float *l_ln1_rstd = acts.ln1_rstd + (l + 1) * B * T;
+                  const floatX *l_ln1w = params.ln1w + (l + 1) * C;
+                  const floatX *l_ln1b = params.ln1b + (l + 1) * C;
+                  fused_residual_forward5(l_residual3, l_ln1, l_ln1_mean, l_ln1_rstd, l_residual2, scratch, l_ln1w, l_ln1b,
+                                          B * T, C, main_stream);
+            }
+            else
+            {
+                  fused_residual_forward5(l_residual3, acts.lnf, acts.lnf_mean, acts.lnf_rstd, l_residual2, scratch,
+                                          params.lnfw, params.lnfb,
+                                          B * T, C, main_stream);
+            }
+      }
+      // Final matmul of acts.lnf with wte (no bias) and Vp output channels, written to acts.output.
+      matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp, main_stream);
+      cudaCheck(cudaDeviceSynchronize());
+}
+
+float gpt2_validate(GPT2 *model, const int *inputs, const int *targets, size_t B, size_t T)
+{
+      // Runs a forward pass, computes the per-position losses and returns their mean over B * T positions.
+      assert(targets != NULL);
+      gpt2_forward(model, inputs, B, T);
+
+      const size_t V = model->config.vocab_size;
+      const size_t Vp = model->config.padded_vocab_size;
+      const size_t num_tokens = B * T;
+      NvtxRange classifier_and_loss_range("classifier_and_loss");
+      float *device_losses = model->acts.losses;
+      floatX *logits = model->acts.output; // left there by gpt2_forward's final matmul
+      const float dloss = 1.0f / (B * T);
+      cudaCheck(cudaMemset(device_losses, 0, num_tokens * sizeof(float)));
+      cudaCheck(cudaMemcpy(model->targets, targets, num_tokens * sizeof(int), cudaMemcpyHostToDevice));
+      tokenCheck(targets, num_tokens, V);
+      fused_classifier(logits, device_losses, dloss, model->targets, B, T, V, Vp, False, main_stream);
+      cudaCheck(cudaMemcpy(model->cpu_losses, device_losses, num_tokens * sizeof(float), cudaMemcpyDeviceToHost));
+
+      float loss_sum = 0.0f;
+      for (int i = 0; i < num_tokens; i++)
+      {
+            loss_sum += model->cpu_losses[i];
+      }
+      const float mean_loss = loss_sum / num_tokens;
+      cudaCheck(cudaDeviceSynchronize());
+      return mean_loss;
+}
+
+void gpt2_backward_and_reduce(GPT2 *model, int *inputs, const int *targets, int grad_accum_steps, int micro_step)
+{
+      if (model->grads_memory == nullptr)
+      {
+            fprintf(stderr, "Need to allocate gradients before backward");
+            exit(EXIT_FAILURE);
+      }
+      NVTX_RANGE_FN();
+      bool last_step = micro_step == grad_accum_steps - 1;
+
+      if (micro_step == 0)
+      {
+
+            cudaCheck(cudaMemsetAsync(model->acts.losses, 0, model->batch_size * model->seq_len * sizeof(float), main_stream));
+            cudaCheck(cudaMemsetAsync(model->grads_memory, 0, model->num_parameters * sizeof(floatX), main_stream));
+      }
+
+      const size_t B = model->batch_size;
+      const size_t T = model->seq_len;
+      const size_t V = model->config.vocab_size;
+      const size_t Vp = model->config.padded_vocab_size;
+      const size_t L = model->config.num_layers;
+      const size_t NH = model->config.num_heads;
+      const size_t C = model->config.channels;
+
+      ParameterTensors params = model->params;
+      ParameterTensors grads = model->grads;
+      ActivationTensors acts = model->acts;
+
+      NvtxRange classifier_and_loss_range("classifier_and_loss");
+      const float dloss = 1.0f / (float)(B * T * grad_accum_steps);
+      cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice));
+      tokenCheck(targets, B * T, V);
+      fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp, True, main_stream);
+
+      floatX *dresidual = (floatX *)model->acts.scratch_btc;
+      cudaCheck(cudaMemset(dresidual, 0, B * T * C * sizeof(floatX)));
+
+      float *scratchF = (float *)acts.output;
+      floatX *scratchX = (floatX *)acts.output;
+
+      matmul_backward(model->acts.scratch_bt4c, grads.wte, NULL, acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp, main_stream);
+
+      floatX *residual = acts.residual3 + (L - 1) * B * T * C;
+      layernorm_backward(dresidual, grads.lnfw, grads.lnfb, scratchF, model->acts.scratch_bt4c, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C, main_stream);
+
+      floatX *dl_btc = residual;
+
+      for (int l = L - 1; l >= 0; l--)
+      {
+            NvtxRange layer_range("Layer", l);
+
+            residual = l == 0 ? acts.encoded : acts.residual3 + (l - 1) * B * T * C;
+
+            floatX *l_ln1w = params.ln1w + l * C;
+            floatX *l_ln1b = params.ln1b + l * C;
+            floatX *l_qkvw = params.qkvw + l * 3 * C * C;
+            floatX *l_attprojw = params.attprojw + l * C * C;
+            floatX *l_ln2w = params.ln2w + l * C;
+            floatX *l_ln2b = params.ln2b + l * C;
+            floatX *l_fcw = params.fcw + l * 4 * C * C;
+            floatX *l_fcprojw = params.fcprojw + l * C * 4 * C;
+
+            floatX *dl_ln1w = grads.ln1w + l * C;
+            floatX *dl_ln1b = grads.ln1b + l * C;
+            floatX *dl_qkvw = grads.qkvw + l * 3 * C * C;
+            floatX *dl_qkvb = grads.qkvb + l * 3 * C;
+            floatX *dl_attprojw = grads.attprojw + l * C * C;
+            floatX *dl_attprojb = grads.attprojb + l * C;
+            floatX *dl_ln2w = grads.ln2w + l * C;
+            floatX *dl_ln2b = grads.ln2b + l * C;
+            floatX *dl_fcw = grads.fcw + l * 4 * C * C;
+            floatX *dl_fcb = grads.fcb + l * 4 * C;
+            floatX *dl_fcprojw = grads.fcprojw + l * C * 4 * C;
+            floatX *dl_fcprojb = grads.fcprojb + l * C;
+
+            floatX *l_ln1 = (model->recompute < 2) ? acts.ln1 + l * B * T * C : acts.lnf;
+            float *l_ln1_mean = acts.ln1_mean + l * B * T;
+            float *l_ln1_rstd = acts.ln1_rstd + l * B * T;
+            floatX *l_qkvr = acts.qkvr + l * B * T * 3 * C;
+            floatX *l_atty = acts.atty + l * B * T * C;
+            floatX *l_residual2 = acts.residual2 + l * B * T * C;
+            floatX *l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf;
+            float *l_ln2_mean = acts.ln2_mean + l * B * T;
+            float *l_ln2_rstd = acts.ln2_rstd + l * B * T;
+            floatX *l_fch_pre_gelu = acts.fch + l * B * T * 4 * C;
+            floatX *l_fch_gelu = (model->recompute < 1) ? acts.fch_gelu + l * B * T * 4 * C : acts.fch_gelu;
+
+            floatX *dl_bt4c = (floatX *)model->acts.scratch_bt4c;
+
+            if (model->recompute >= 1)
+            {
+
+                  gelu_forward(l_fch_gelu, l_fch_pre_gelu, B * T * 4 * C, main_stream);
+            }
+            matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4 * C, C, main_stream, l_fch_pre_gelu, model->gelu_fusion);
+            if (model->recompute >= 2)
+            {
+
+                  layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C, main_stream);
+            }
+            matmul_backward(dl_btc, dl_fcw, dl_fcb, dl_bt4c, l_ln2, l_fcw, scratchF, B, T, C, 4 * C, main_stream);
+
+            layernorm_backward(dresidual, dl_ln2w, dl_ln2b, scratchF, dl_btc, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C, main_stream);
+            matmul_backward(dl_btc, dl_attprojw, dl_attprojb, dresidual, l_atty, l_attprojw, scratchF, B, T, C, C, main_stream);
+
+#ifdef ENABLE_CUDNN
+            float *l_att = (float *)acts.att + l * B * NH * T;
+            attention_backward_cudnn(dl_bt4c, dl_btc, l_qkvr, l_atty, (float *)l_att, B, T, NH, C, main_stream);
+#else
+            floatX *l_att = acts.att + l * B * NH * T * T;
+
+            floatX *buffer_a = l_atty;
+            floatX *buffer_b = l_fch_pre_gelu;
+            attention_backward(dl_bt4c, buffer_b, scratchX, buffer_a, dl_btc, l_qkvr, l_att, B, T, C, NH, main_stream);
+#endif
+            if (model->recompute >= 2)
+            {
+                  layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C, main_stream);
+            }
+
+            matmul_backward(dl_btc, dl_qkvw, dl_qkvb, dl_bt4c, l_ln1, l_qkvw, scratchF, B, T, C, 3 * C, main_stream);
+
+            layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C, main_stream);
+
+            if (last_step)
+            {
+                  floatX *const pointers[] = {
+                      dl_ln1w, dl_ln1b,
+                      dl_qkvw, dl_qkvb,
+                      dl_attprojw, dl_attprojb,
+                      dl_ln2w, dl_ln2b,
+                      dl_fcw, dl_fcb,
+                      dl_fcprojw, dl_fcprojb};
+                  const size_t nelem[] = {
+                      C, C,
+                      3 * C * C, 3 * C,
+                      C * C, C,
+                      C, C,
+                      4 * C * C, 4 * C,
+                      C * 4 * C, C};
+                  multi_gpu_async_reduce_gradient(pointers, nelem, &multi_gpu_config, main_stream);
+            }
+      }
+      encoder_backward(grads.wte, grads.wpe, scratchX, model->workload_indices, model->bucket_info,
+                       dresidual, model->inputs, inputs, B, T, C, random_u32(&model->rng_state), main_stream);
+
+      if (last_step)
+      {
+
+            global_sum_deterministic(model->accumulated_mean_loss, acts.losses, B * T, main_stream);
+
+#if MULTI_GPU
+            ncclCheck(ncclAllReduce(model->accumulated_mean_loss, model->accumulated_mean_loss, sizeof(float), ncclFloat, ncclAvg, multi_gpu_config.nccl_comm, main_stream));
+#endif
+            cudaCheck(cudaMemcpyAsync(&model->mean_loss, model->accumulated_mean_loss, sizeof(float), cudaMemcpyDeviceToHost, main_stream));
+
+            floatX *const pointers[] = {grads.wte, grads.wpe, grads.lnfw, grads.lnfb};
+            const size_t nelem[] = {Vp * C, T * C, C, C};
+            multi_gpu_async_reduce_gradient(pointers, nelem, &multi_gpu_config, main_stream);
+      }
+
+      cudaCheck(cudaDeviceSynchronize());
+      if (last_step)
+      {
+            model->mean_loss /= B * T * grad_accum_steps;
+      }
+      else
+      {
+            model->mean_loss = -1.f;
+      }
+}
+
+// Locates one parameter tensor inside the flat parameter layout.
+// Returns {offset, size} counted in elements: the offset is the combined size of all tensors
+// with a smaller id, and for tensor ids 2..13 the result is narrowed to the slice that belongs
+// to `layer_id`. For every other tensor id `layer_id` is ignored.
+ShardInfo gpt2_get_tensor_at_layer(const GPT2 *model, int layer_id, int param_tensor_id)
+{
+      ptrdiff_t start = 0;
+      int idx = 0;
+      while (idx < param_tensor_id)
+      {
+            start += (ptrdiff_t)model->param_elements[idx];
+            idx++;
+      }
+
+      size_t count = model->param_elements[param_tensor_id];
+      const bool stored_per_layer = (param_tensor_id >= 2) && (param_tensor_id <= 13);
+      if (!stored_per_layer)
+      {
+            return {start, count};
+      }
+
+      // param_elements[] holds the total over all layers; one layer is an equal share of it.
+      count /= model->config.num_layers;
+      start += (ptrdiff_t)(layer_id * count);
+      return {start, count};
+}
+
+// Computes the L2 norm of the gradients and returns it on the host.
+// Squared partial sums are accumulated in acts.output (borrowed here as float scratch) and
+// folded by global_sum_deterministic; element 0 is what gets copied back. With ZeRO stage 1
+// each rank only covers its own shard, so in a MULTI_GPU build the per-rank sums are added
+// together across ranks before the square root is taken.
+float gpt2_calculate_grad_norm(GPT2 *model, MultiGpuConfig *multi_gpu_config)
+{
+      NVTX_RANGE_FN();
+      floatX *grads_memory = (floatX *)model->grads_memory;
+
+      float *grad_norm_squared = (float *)model->acts.output;
+      float grad_norm_squared_cpu = 0.0f;
+
+      // The calls below use either 1 slice or num_layers slices per launch.
+      int num_slices[2] = {1, model->config.num_layers};
+      int max_num_block_sums = get_max_num_block_sums(num_slices, 2);
+      if (multi_gpu_config->zero_stage == 1)
+      {
+            // Visit this rank's shard of every parameter tensor.
+            for (int i = 0; i < NUM_PARAMETER_TENSORS; i++)
+            {
+                  ShardInfo tensor = gpt2_get_tensor_at_layer(model, 0, i);
+                  ShardInfo shard = multi_gpu_get_shard_offset(tensor.size, multi_gpu_config, 1);
+                  ptrdiff_t offset = tensor.offset + shard.offset;
+                  // True for tensor 0 only (presumably resets the accumulator - confirm in global_norm_squared).
+                  bool is_first_pass = (i == 0);
+                  if ((i < 2 || i > 13))
+                  {
+                        // Tensor without a per-layer dimension: a single slice.
+                        global_norm_squared(grad_norm_squared, grads_memory + offset, shard.size, 0, 1,
+                                            max_num_block_sums, is_first_pass, main_stream);
+                  }
+                  else
+                  {
+                        // Per-layer tensor: num_layers slices, tensor.size passed as the fourth
+                        // argument (presumably the distance between slices - confirm).
+                        global_norm_squared(grad_norm_squared, grads_memory + offset, shard.size, tensor.size, model->config.num_layers,
+                                            max_num_block_sums, is_first_pass, main_stream);
+                  }
+            }
+            global_sum_deterministic(grad_norm_squared, grad_norm_squared, max_num_block_sums, main_stream);
+#if MULTI_GPU
+            // NCCL's count argument is a number of elements, not bytes: the value being summed
+            // across ranks is a single float.
+            ncclCheck(ncclAllReduce(grad_norm_squared, grad_norm_squared, 1, ncclFloat, ncclSum, multi_gpu_config->nccl_comm, main_stream));
+#endif
+      }
+      else
+      {
+            // No sharding: one pass over all num_parameters gradient elements.
+            global_norm_squared(grad_norm_squared, grads_memory, model->num_parameters, 0, 1, max_num_block_sums, true, main_stream);
+            global_sum_deterministic(grad_norm_squared, grad_norm_squared, max_num_block_sums, main_stream);
+      }
+      // Copy the single result float back to the host.
+      cudaCheck(cudaMemcpy(&grad_norm_squared_cpu, grad_norm_squared, sizeof(float), cudaMemcpyDeviceToHost));
+      float grad_norm_cpu = sqrtf(grad_norm_squared_cpu);
+      return grad_norm_cpu;
+}
+
+// Runs one optimizer step over every parameter tensor.
+//   learning_rate, beta1, beta2, eps, weight_decay, grad_scale, t: forwarded to adamw_update.
+//   multi_gpu_config: supplies the shard layout and, for ZeRO stage 1, the NCCL communicator.
+//   init_from_master_only: when true no AdamW step is taken; init_from_master() is called for
+//     each tensor instead (used by load_state() after reading master weights from a file).
+// Exits the process if the optimizer buffers are missing, or if init_from_master_only is
+// requested without master weights. Blocks until the device is idle before returning.
+void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_scale, int t,
+                 MultiGpuConfig *multi_gpu_config, bool init_from_master_only = false)
+{
+
+      NVTX_RANGE_FN();
+      if (model->grads_memory == nullptr || model->m_memory == nullptr || model->v_memory == nullptr)
+      {
+            fprintf(stderr, "Need to allocate optimizer state before update\n");
+            exit(EXIT_FAILURE);
+      }
+      // init_from_master() would otherwise be handed a null master pointer.
+      if (init_from_master_only && model->master_weights == nullptr)
+      {
+            fprintf(stderr, "Cannot initialize parameters from master weights: none are allocated\n");
+            exit(EXIT_FAILURE);
+      }
+
+      // First call only: zero both moment buffers (shard_num_parameters floats each).
+      bool init_state = model->init_state;
+      if (init_state)
+      {
+            model->init_state = false;
+            NvtxRange rng("InitOpt");
+            cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float)));
+            cudaCheck(cudaMemset(model->v_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float)));
+      }
+
+      // Remember the RNG state before any per-tensor seed is drawn; load_state() rewinds to
+      // this value before it calls back in here with init_from_master_only = true.
+      model->rng_state_last_update = model->rng_state;
+
+      for (int i = 0; i < NUM_PARAMETER_TENSORS; i++)
+      {
+
+            // One seed per tensor, drawn unconditionally so the sequence depends only on rng_state.
+            unsigned int seed = random_u32(&model->rng_state);
+
+            // Only tensor ids 2..13 are processed as num_layers slices; the rest as one slice.
+            int num_layers = model->config.num_layers;
+            if ((i < 2 || i > 13))
+            {
+                  num_layers = 1;
+            }
+
+            // Offsets are in elements. The "full" offset indexes params/grads; the "partial"
+            // offset (tensor offset divided by the process count) indexes the optimizer state
+            // when zero_stage >= 1.
+            ShardInfo tensor = gpt2_get_tensor_at_layer(model, 0, i);
+            ShardInfo shard = multi_gpu_get_shard_offset(tensor.size, multi_gpu_config, 1);
+            ptrdiff_t local_offset_full = tensor.offset + shard.offset;
+            ptrdiff_t local_offset_partial = tensor.offset / multi_gpu_config->num_processes;
+
+            // Weight decay is applied to tensor ids 0, 1, 4, 6, 10 and 12 only.
+            float wd = (i == 0 || i == 1 || i == 4 || i == 6 || i == 10 || i == 12) ? weight_decay : 0.0f;
+            floatX *param_ptr = (floatX *)model->params_memory + local_offset_full;
+            floatX *grad_ptr = (floatX *)model->grads_memory + local_offset_full;
+
+            ptrdiff_t opt_state_offset = multi_gpu_config->zero_stage < 1 ? local_offset_full : local_offset_partial;
+            float *m_ptr = model->m_memory + opt_state_offset;
+            float *v_ptr = model->v_memory + opt_state_offset;
+            float *master_ptr = nullptr;
+            if (model->master_weights != nullptr)
+            {
+                  master_ptr = model->master_weights + opt_state_offset;
+            }
+            // On the very first call, seed the master copy from the current parameters.
+            if (init_state && model->master_weights != nullptr)
+            {
+                  size_t grid_size = CEIL_DIV(shard.size, 512);
+                  copy_and_cast_kernel<<<dim3(grid_size, num_layers), 512, 0, main_stream>>>(master_ptr, param_ptr, shard.size,
+                                                                                             shard.size, tensor.size);
+                  cudaCheck(cudaGetLastError());
+            }
+
+            if (init_from_master_only)
+            {
+                  // No gradient step: derive the parameters from the master copy.
+                  init_from_master(param_ptr, master_ptr, shard.size, tensor.size, shard.size, num_layers, seed, main_stream);
+            }
+            else
+            {
+                  // Regular AdamW step on this rank's shard.
+                  adamw_update(param_ptr, master_ptr, grad_ptr,
+                               m_ptr, v_ptr,
+                               shard.size, tensor.size, tensor.size, shard.size, num_layers,
+                               learning_rate,
+                               beta1, beta2, t, eps, wd, grad_scale, seed, main_stream);
+            }
+
+            if (multi_gpu_config->zero_stage == 1)
+            {
+#if MULTI_GPU
+                  ncclCheck(ncclGroupStart());
+                  for (int l = 0; l < num_layers; ++l)
+                  {
+                        // Gather every rank's shard of slice l back into the full tensor in params_memory.
+                        ncclCheck(ncclAllGather(param_ptr + l * tensor.size,
+                                                (floatX *)model->params_memory + tensor.offset + l * tensor.size,
+                                                shard.size, ncclFloatX,
+                                                multi_gpu_config->nccl_comm, multi_gpu_config->nccl_stream));
+                  }
+                  ncclCheck(ncclGroupEnd());
+#endif
+            }
+      }
+
+      cudaCheck(cudaDeviceSynchronize());
+}
+
+// Estimates model FLOPs utilization (MFU) for one training step.
+// `num_tokens` is the number of tokens processed in the step and `dt` its duration (presumably
+// in seconds, since the promised figure is scaled by 1e12 - confirm with the caller).
+// Returns achieved throughput divided by the promised throughput, or -1 when the scaled value
+// obtained from get_flops_promised() is negative.
+float gpt2_estimate_mfu(GPT2 *model, int num_tokens, float dt)
+{
+      const size_t n_params = model->num_parameters;
+      const int n_layers = model->config.num_layers;
+      const int n_channels = model->config.channels;
+      const int seq_len = model->seq_len;
+
+      // Per token: 6 FLOPs per parameter plus a 6 * L * C * T term, accumulated in size_t.
+      const size_t per_token = 6 * n_params + (size_t)6 * n_layers * n_channels * seq_len;
+      const size_t per_step = per_token * num_tokens;
+
+      const float achieved = (float)per_step * (1.0f / dt);
+      const float peak = get_flops_promised(deviceProp.name, PRECISION_MODE) * 1e12f;
+      if (peak < 0)
+      {
+            return -1.f;
+      }
+      return achieved / peak;
+}
+
+// Releases the buffers held by `model`. The GPT2 struct itself is not freed here.
+void gpt2_free(GPT2 *model)
+{
+      // Buffers released through cudaFreeCheck, which is given the address of each pointer.
+      cudaFreeCheck(&model->params_memory);
+      cudaFreeCheck(&model->grads_memory);
+      cudaFreeCheck(&model->m_memory);
+      cudaFreeCheck(&model->v_memory);
+      cudaFreeCheck(&model->master_weights);
+      cudaFreeCheck(&model->acts_memory);
+      cudaFreeCheck(&model->inputs);
+      cudaFreeCheck(&model->targets);
+      cudaFreeCheck(&model->accumulated_mean_loss);
+      // cpu_losses is released with cudaFreeHost rather than cudaFreeCheck.
+      cudaCheck(cudaFreeHost(model->cpu_losses));
+      // The last two arrays are released with plain free().
+      free(model->workload_indices);
+      free(model->bucket_info);
+}
+
+// Process-wide GPU setup: queries the properties of the local device, creates the main stream,
+// the cuBLASLt handle and its workspace, selects the cuBLAS compute mode, and calls
+// create_cudnn() when ENABLE_CUDNN is defined.
+//   override_enable_tf32: pass false to keep plain FP32 compute even where TF32 would be chosen.
+//   print_device_info: print the "[System]" banner with the device index and name.
+void common_start(bool override_enable_tf32 = true, bool print_device_info = true)
+{
+      const int device_idx = multi_gpu_config.local_device_idx;
+      cudaCheck(cudaGetDeviceProperties(&deviceProp, device_idx));
+      if (print_device_info)
+      {
+            printf("[System]\n");
+            printf("Device %d: %s\n", device_idx, deviceProp.name);
+      }
+
+      cudaCheck(cudaStreamCreate(&main_stream));
+      nvtxNameCudaStreamA(main_stream, "main stream");
+      cublasCheck(cublasLtCreate(&cublaslt_handle));
+      cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size));
+
+      // TF32 is chosen only for FP32 builds, on devices with major version >= 8, and only when
+      // the caller has not opted out.
+      cublas_compute = CUBLAS_COMPUTE_32F;
+      if (PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32)
+      {
+            cublas_compute = CUBLAS_COMPUTE_32F_FAST_TF32;
+      }
+
+#ifdef ENABLE_CUDNN
+      create_cudnn();
+#endif
+}
+
+// Releases the global resources created in common_start(): the main stream, the cuBLASLt
+// workspace and handle, and (when ENABLE_CUDNN is defined) whatever destroy_cudnn() tears down.
+// The `model` argument is not used by this function.
+void common_free(GPT2 &model)
+{
+      cudaCheck(cudaStreamDestroy(main_stream));
+      cudaCheck(cudaFree(cublaslt_workspace));
+      cublasCheck(cublasLtDestroy(cublaslt_handle));
+#ifdef ENABLE_CUDNN
+      destroy_cudnn();
+#endif
+}
+
+// Writes this rank's resume file to `filename`.
+// Layout: a 256-int header, then the m and v optimizer buffers, then the master weights when
+// model->use_master_weights is set, then (when the loader shuffles) the loader's shard order,
+// intra-shard order and shuffle RNG. load_state() reads the same layout back.
+void save_state(const char *filename, int step, GPT2 *model, DataLoader *loader)
+{
+      printf("Writing state to %s\n", filename);
+      FILE *state_file = fopenCheck(filename, "wb");
+      int state_header[256];
+      memset(state_header, 0, sizeof(state_header));
+
+      // Slots 0 and 1 are fixed constants that load_state() checks before trusting the file.
+      state_header[0] = 20240527;
+      state_header[1] = 1;
+      state_header[2] = multi_gpu_config.num_processes;
+      state_header[3] = multi_gpu_config.process_rank;
+      state_header[4] = model->use_master_weights;
+      state_header[5] = loader->should_shuffle;
+
+      state_header[10] = step;
+
+      // The fields below are wider than one int. They are stored with memcpy because writing
+      // them through a casted int pointer violates strict aliasing and may be misaligned;
+      // the bytes that end up in the header are the same.
+      unsigned long long rng_now = model->rng_state;
+      unsigned long long rng_at_last_update = model->rng_state_last_update;
+      memcpy(&state_header[20], &rng_now, sizeof(rng_now));
+      memcpy(&state_header[22], &rng_at_last_update, sizeof(rng_at_last_update));
+
+      size_t shard_idx = loader->current_shard_idx;
+      size_t sample_idx = loader->current_sample_idx;
+      memcpy(&state_header[30], &shard_idx, sizeof(shard_idx));
+      memcpy(&state_header[32], &sample_idx, sizeof(sample_idx));
+      fwriteCheck(state_header, sizeof(int), 256, state_file);
+
+      // Optimizer buffers, shard_num_parameters floats each.
+      size_t shard_num_parameters = multi_gpu_config.shard_num_parameters;
+      device_to_file(state_file, model->m_memory, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
+      device_to_file(state_file, model->v_memory, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
+      if (model->use_master_weights)
+      {
+            device_to_file(state_file, model->master_weights, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
+      }
+
+      // Loader shuffle state: each array is preceded by its element count.
+      if (loader->should_shuffle)
+      {
+            fwriteCheck(&loader->glob_result.gl_pathc, sizeof(size_t), 1, state_file);
+            fwriteCheck(loader->shard_indices, sizeof(int), loader->glob_result.gl_pathc, state_file);
+            fwriteCheck(&loader->shard_num_samples, sizeof(size_t), 1, state_file);
+            fwriteCheck(loader->intra_shard_indices, sizeof(int), loader->shard_num_samples, state_file);
+            fwriteCheck(&loader->shuffle_rng, sizeof(mt19937_state), 1, state_file);
+      }
+      fcloseCheck(state_file);
+}
+
+// Restores the training state written by save_state() from `filename`.
+//   step:   receives the saved step counter.
+//   model:  receives the RNG states and the m / v (and, if enabled, master weight) buffers.
+//   loader: receives the shuffle state (when recorded) and is resumed at the saved position.
+// The header constants and the process count / rank are checked with assert. The process
+// exits if the run wants master weights but the file has none.
+void load_state(int *step, GPT2 *model, DataLoader *loader, const char *filename)
+{
+      FILE *state_file = fopenCheck(filename, "rb");
+      int state_header[256];
+      freadCheck(state_header, sizeof(int), 256, state_file);
+      assert(state_header[0] == 20240527);
+      assert(state_header[1] == 1);
+      assert(state_header[2] == multi_gpu_config.num_processes);
+      assert(state_header[3] == multi_gpu_config.process_rank);
+      int use_master_weights = state_header[4];
+      int should_shuffle = state_header[5];
+      *step = state_header[10];
+
+      // These fields are wider than one int. They are copied out with memcpy because reading
+      // them through a casted int pointer violates strict aliasing and may be misaligned.
+      unsigned long long saved_rng;
+      unsigned long long saved_rng_last_update;
+      memcpy(&saved_rng, &state_header[20], sizeof(saved_rng));
+      memcpy(&saved_rng_last_update, &state_header[22], sizeof(saved_rng_last_update));
+      model->rng_state = saved_rng;
+      model->rng_state_last_update = saved_rng_last_update;
+
+      size_t current_shard_idx;
+      size_t current_sample_idx;
+      memcpy(&current_shard_idx, &state_header[30], sizeof(current_shard_idx));
+      memcpy(&current_sample_idx, &state_header[32], sizeof(current_sample_idx));
+
+      size_t shard_num_parameters = multi_gpu_config.shard_num_parameters;
+      if (use_master_weights == 1 && !model->use_master_weights)
+      {
+            printf0("Warning: Master weights are present in state, but not enabled for current run.\n");
+      }
+      else if (use_master_weights == 0 && model->use_master_weights)
+      {
+            printf0("Error: Master weights requested, but not present in state file.\n");
+            exit(EXIT_FAILURE);
+      }
+
+      // The moments come from the file, so gpt2_update() must not zero them on its next call.
+      model->init_state = false;
+      assert(model->m_memory != nullptr);
+      assert(model->v_memory != nullptr);
+      file_to_device(model->m_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
+      file_to_device(model->v_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
+      if (model->use_master_weights)
+      {
+            assert(model->master_weights != nullptr);
+            file_to_device(model->master_weights, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
+
+            // Re-derive the parameters from the master weights with the RNG state recorded at
+            // the last update, then put the current RNG state back.
+            model->rng_state = model->rng_state_last_update;
+            gpt2_update(model, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0, &multi_gpu_config, true);
+            model->rng_state = saved_rng;
+      }
+      else if (use_master_weights != 0)
+      {
+            // The file holds master weights this run does not use. save_state() writes them
+            // between the moments and the loader section, so read past them; otherwise the
+            // loader fields below would be decoded from master-weight bytes.
+            char discard[1 << 16];
+            size_t remaining = shard_num_parameters * sizeof(float);
+            while (remaining > 0)
+            {
+                  size_t chunk = remaining < sizeof(discard) ? remaining : sizeof(discard);
+                  freadCheck(discard, 1, chunk, state_file);
+                  remaining -= chunk;
+            }
+      }
+
+      loader->should_shuffle = should_shuffle;
+      if (should_shuffle == 1)
+      {
+
+            // The saved shard count must match the loader's before the saved order is read.
+            size_t glob_result_gl_pathc;
+            freadCheck(&glob_result_gl_pathc, sizeof(size_t), 1, state_file);
+            assert(glob_result_gl_pathc == loader->glob_result.gl_pathc);
+
+            loader->shard_indices = (int *)mallocCheck(loader->glob_result.gl_pathc * sizeof(int));
+            freadCheck(loader->shard_indices, sizeof(int), loader->glob_result.gl_pathc, state_file);
+
+            // Same check for the per-shard sample count, followed by the intra-shard order.
+            size_t shard_num_samples;
+            freadCheck(&shard_num_samples, sizeof(size_t), 1, state_file);
+            assert(shard_num_samples == loader->shard_num_samples);
+
+            loader->intra_shard_indices = (int *)mallocCheck(loader->shard_num_samples * sizeof(int));
+            freadCheck(loader->intra_shard_indices, sizeof(int), loader->shard_num_samples, state_file);
+
+            freadCheck(&loader->shuffle_rng, sizeof(mt19937_state), 1, state_file);
+      }
+      dataloader_resume(loader, current_shard_idx, current_sample_idx);
+
+      fcloseCheck(state_file);
+}
+
+// Writes a checkpoint for `step` into `output_log_dir`.
+// Rank 0 writes model_<step>.bin; every rank writes its own state_<step>_<rank>.bin. After a
+// barrier across ranks, rank 0 creates an empty DONE_<step> file. All paths are formatted
+// into the shared filename_buffer.
+void write_checkpoint(const char *output_log_dir, int step, GPT2 *model, DataLoader *train_loader, MultiGpuConfig *multi_gpu_config)
+{
+      printf0("Writing checkpoint at step %d\n", step);
+      const int my_rank = multi_gpu_config->process_rank;
+      const bool is_root = (my_rank == 0);
+
+      // Model weights: written by rank 0 only.
+      if (is_root)
+      {
+            snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, step);
+            gpt2_write_to_checkpoint(model, filename_buffer);
+      }
+
+      // Optimizer and loader state: one file per rank.
+      snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, my_rank);
+      save_state(filename_buffer, step, model, train_loader);
+
+      // The DONE marker is created only after every rank has passed the barrier.
+      multi_gpu_barrier(multi_gpu_config);
+      if (!is_root)
+      {
+            return;
+      }
+      snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step);
+      FILE *marker = fopenCheck(filename_buffer, "w");
+      fcloseCheck(marker);
+}
+
+// Removes the files of the checkpoint written for `step` in `output_log_dir`.
+// Rank 0 removes model_<step>.bin and DONE_<step>; every rank removes its own
+// state_<step>_<rank>.bin. The result of remove() is not checked, so files that are already
+// gone are silently ignored.
+void delete_checkpoint(const char *output_log_dir, int step, MultiGpuConfig *multi_gpu_config)
+{
+      printf0("Deleting checkpoint at step %d\n", step);
+      const int my_rank = multi_gpu_config->process_rank;
+      const bool is_root = (my_rank == 0);
+
+      if (is_root)
+      {
+            snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, step);
+            remove(filename_buffer);
+      }
+
+      snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, my_rank);
+      remove(filename_buffer);
+
+      if (is_root)
+      {
+            snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step);
+            remove(filename_buffer);
+      }
+}
+
+#ifndef TESTING
+
+// Prints the command-line help to stderr and exits with EXIT_FAILURE.
+// The option letters listed here must match the parser in main(): in particular the NCCL
+// options are parsed as -pi (init method), -pf (fs path) and -ps (server ip).
+void error_usage()
+{
+      fprintf(stderr, "Usage:   ./train_gpt2cu [options]\n");
+      fprintf(stderr, "Options:\n");
+
+      // Files, logging and checkpointing.
+      fprintf(stderr, "  -i <string> train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n");
+      fprintf(stderr, "  -j <string> val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n");
+      fprintf(stderr, "  -e <string> input .bin filename or descriptor, see code comments as docs. (default = gpt2_124M_bf16.bin)\n");
+      fprintf(stderr, "  -o <string> output log dir (default = NULL, no logging)\n");
+      fprintf(stderr, "  -lg <int>   log gpu info every x steps (default = -1; disabled)\n");
+      fprintf(stderr, "  -n <int>    write optimization checkpoints every how many steps? (default 0, don't)\n");
+      fprintf(stderr, "  -nk <int>   max number of checkpoints to keep in the directory, removing old ones (0 = disable, default)\n");
+      fprintf(stderr, "  -nm <int>   every how many step checkpoints are considered major? major checkpoints never get deleted.\n");
+      fprintf(stderr, "  -y <int>    resume optimization found inside output log dir? (0=restart/overwrite, 1=resume/append)\n");
+
+      // Batch shape.
+      fprintf(stderr, "  -b <int>    (per-GPU, micro) batch size B (default = 4)\n");
+      fprintf(stderr, "  -t <int>    sequence length T (default = 1024)\n");
+      fprintf(stderr, "  -d <int>    total desired batch size (default = B * T * num_processes, i.e. no grad accumulation)\n");
+
+      fprintf(stderr, "  -x <int>    max_steps of optimization to run (-1 (default) = disable, run 1 epoch)\n");
+
+      // Optimization.
+      fprintf(stderr, "  -k <string> learning rate scheduler (default = cosine)\n");
+      fprintf(stderr, "  -l <float>  learning rate (default = 3e-4f)\n");
+      fprintf(stderr, "  -u <int>    learning rate warmup iterations (default = 0, no warmup)\n");
+      fprintf(stderr, "  -q <float>  learning rate decay: final fraction, at end of training (default = 1.0 (no decay))\n");
+      fprintf(stderr, "  -c <float>  weight decay (default = 0.0f)\n");
+      fprintf(stderr, "  -sl <float> outlier stability: skip update if loss goes above this in zscore (0.0f=off)\n");
+      fprintf(stderr, "  -sg <float> outlier stability: skip update if grad_norm goes above this in zscore (0.0f=off)\n");
+
+      // Evaluation and sampling.
+      fprintf(stderr, "  -v <int>    val_loss_every, how often we evaluate val loss (default = 20)\n");
+      fprintf(stderr, "  -m <int>    val_max_steps, up to how many val batches to estimate val loss? (default = 20)\n");
+      fprintf(stderr, "  -s <int>    sample_every, how often we inference the model (default = 20)\n");
+      fprintf(stderr, "  -g <int>    genT, how many steps of inference we do (default = 64)\n");
+      fprintf(stderr, "  -h <int>    hellaswag eval run? (default = 0)\n");
+
+      fprintf(stderr, "  -a <int>    overfit a single batch? 0/1. useful for debugging\n");
+
+      // Numerics. main() falls back to gelu fusion 0 when -ge is not given.
+      fprintf(stderr, "  -f <int>    enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
+      fprintf(stderr, "  -w <int>    keep f32 copy of weights for the optimizer? (default: 1)\n");
+      fprintf(stderr, "  -ge <int>   gelu fusion: 0=none, 1=forward, 2=forward+backward (default: 0)\n");
+
+      fprintf(stderr, "  -z <int>    zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n");
+      fprintf(stderr, "  -r <int>    recompute: less memory but less speed. (default = 1), 0|1|2 = none,gelu,gelu+ln\n");
+
+      // Multi-process setup.
+      fprintf(stderr, "  -pn <int>    num_processes (default = 1)\n");
+      fprintf(stderr, "  -pr <int>    process_rank (default = 0)\n");
+      fprintf(stderr, "  -pg <int>    gpus_per_node (default = 8)\n");
+      fprintf(stderr, "  -pi <string> nccl_init_method: tcp,fs,mpi (default = mpi)\n");
+      fprintf(stderr, "  -ps <string> server_ip - used only when nccl_init_method is tcp (default = -1)\n");
+      fprintf(stderr, "  -pf <string> fs_path - used only when nccl_init_method is fs (default = /tmp)\n");
+      exit(EXIT_FAILURE);
+}
+
+int main(int argc, char *argv[])
+{
+
+      const char *train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin";
+      const char *val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin";
+      const char *load_filename = "gpt2_124M_bf16.bin";
+      const char *lr_scheduler_type = "cosine";
+      const char *output_log_dir = NULL;
+      int checkpoint_every = 0;
+      int checkpoints_keep = 0;
+      int major_checkpoint_every = 0;
+      int resume = 0;
+      int B = 4;
+      int T = 1024;
+      int total_batch_size = -1;
+      float learning_rate = 3e-4f;
+      int log_gpu_every = -1;
+      int warmup_iterations = 0;
+      float final_learning_rate_frac = 1.0f;
+      float weight_decay = 0.0f;
+      float skip_update_lossz = 0.0f;
+      float skip_update_gradz = 0.0f;
+      int val_loss_every = 20;
+      int val_max_steps = 20;
+      int sample_every = 20;
+      int genT = 64;
+      int overfit_single_batch = 0;
+      int max_steps = -1;
+      int override_enable_tf32 = 1;
+      int use_master_weights = 1;
+      int gelu_fusion = -1;
+      int recompute = 1;
+      int zero_stage = 0;
+      int hellaswag_eval = 0;
+
+      int num_processes = 1;
+      int process_rank = 0;
+      int gpus_per_node = 8;
+      char nccl_init_method[256] = "mpi";
+      char server_ip[256] = "";
+      char fs_path[256] = "";
+      for (int i = 1; i < argc; i += 2)
+      {
+            if (i + 1 >= argc)
+            {
+                  error_usage();
+            }
+            if (argv[i][0] != '-')
+            {
+                  error_usage();
+            }
+            if (!(strlen(argv[i]) == 2 || strlen(argv[i]) == 3))
+            {
+                  error_usage();
+            }
+
+            if (argv[i][1] == 'i')
+            {
+                  train_data_pattern = argv[i + 1];
+            }
+            else if (argv[i][1] == 'j')
+            {
+                  val_data_pattern = argv[i + 1];
+            }
+            else if (argv[i][1] == 'e')
+            {
+                  load_filename = argv[i + 1];
+            }
+            else if (argv[i][1] == 'o')
+            {
+                  output_log_dir = argv[i + 1];
+            }
+            else if (argv[i][1] == 'n' && argv[i][2] == '\0')
+            {
+                  checkpoint_every = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'y')
+            {
+                  resume = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'b')
+            {
+                  B = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 't')
+            {
+                  T = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'd')
+            {
+                  total_batch_size = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'l' && argv[i][2] == '\0')
+            {
+                  learning_rate = atof(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'l' && argv[i][2] == 'g')
+            {
+                  log_gpu_every = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'u')
+            {
+                  warmup_iterations = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'q')
+            {
+                  final_learning_rate_frac = atof(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'c')
+            {
+                  weight_decay = atof(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'x')
+            {
+                  max_steps = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'v')
+            {
+                  val_loss_every = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'm')
+            {
+                  val_max_steps = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 's' && argv[i][2] == '\0')
+            {
+                  sample_every = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'g' && argv[i][2] == 'e')
+            {
+                  gelu_fusion = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'g')
+            {
+                  genT = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'a')
+            {
+                  overfit_single_batch = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'f')
+            {
+                  override_enable_tf32 = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'w')
+            {
+                  use_master_weights = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'z')
+            {
+                  zero_stage = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'r')
+            {
+                  recompute = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'h')
+            {
+                  hellaswag_eval = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'k')
+            {
+                  lr_scheduler_type = argv[i + 1];
+            }
+            else if (argv[i][1] == 'p' && argv[i][2] == 'i')
+            {
+                  strcpy(nccl_init_method, argv[i + 1]);
+            }
+            else if (argv[i][1] == 'p' && argv[i][2] == 'f')
+            {
+                  strcpy(fs_path, argv[i + 1]);
+            }
+            else if (argv[i][1] == 'p' && argv[i][2] == 's')
+            {
+                  strcpy(server_ip, argv[i + 1]);
+            }
+            else if (argv[i][1] == 'p' && argv[i][2] == 'n')
+            {
+                  num_processes = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'p' && argv[i][2] == 'r')
+            {
+                  process_rank = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'p' && argv[i][2] == 'g')
+            {
+                  gpus_per_node = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 's' && argv[i][2] == 'l')
+            {
+                  skip_update_lossz = atof(argv[i + 1]);
+            }
+            else if (argv[i][1] == 's' && argv[i][2] == 'g')
+            {
+                  skip_update_gradz = atof(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'n' && argv[i][2] == 'k')
+            {
+                  checkpoints_keep = atoi(argv[i + 1]);
+            }
+            else if (argv[i][1] == 'n' && argv[i][2] == 'm')
+            {
+                  major_checkpoint_every = atoi(argv[i + 1]);
+            }
+            else
+            {
+                  error_usage();
+            }
+      }
+
+      multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, server_ip, fs_path, nccl_init_method);
+      common_start(override_enable_tf32, false);
+
+      assert(warmup_iterations >= 0);
+      if (output_log_dir != NULL)
+      {
+            assert(strlen(output_log_dir) < 400);
+      }
+      int tokens_per_fwdbwd = B * T * multi_gpu_config.num_processes;
+
+      if (total_batch_size == -1)
+      {
+            total_batch_size = tokens_per_fwdbwd;
+      }
+
+      if (gelu_fusion == -1)
+      {
+            gelu_fusion = 0;
+      }
+
+      assert(total_batch_size % tokens_per_fwdbwd == 0);
+      int grad_accum_steps = total_batch_size / tokens_per_fwdbwd;
+
+      if (overfit_single_batch == 1)
+      {
+            train_data_pattern = val_data_pattern;
+      }
+      printf0("+-----------------------+----------------------------------------------------+\n");
+      printf0("| Parameter             | Value                                              |\n");
+      printf0("+-----------------------+----------------------------------------------------+\n");
+      printf0("| train data pattern    | %-50s |\n", train_data_pattern);
+      printf0("| val data pattern      | %-50s |\n", val_data_pattern);
+      printf0("| output log dir        | %-50s |\n", output_log_dir == NULL ? "NULL" : output_log_dir);
+      printf0("| checkpoint_every      | %-50d |\n", checkpoint_every);
+      printf0("| resume                | %-50d |\n", resume);
+      printf0("| micro batch size B    | %-50d |\n", B);
+      printf0("| sequence length T     | %-50d |\n", T);
+      printf0("| total batch size      | %-50d |\n", total_batch_size);
+      printf0("| LR scheduler          | %-50s |\n", lr_scheduler_type);
+      printf0("| learning rate (LR)    | %-50e |\n", learning_rate);
+      printf0("| warmup iterations     | %-50d |\n", warmup_iterations);
+      printf0("| final LR fraction     | %-50e |\n", final_learning_rate_frac);
+      printf0("| weight decay          | %-50e |\n", weight_decay);
+      printf0("| skip update lossz     | %-50f |\n", skip_update_lossz);
+      printf0("| skip update gradz     | %-50f |\n", skip_update_gradz);
+      printf0("| max_steps             | %-50d |\n", max_steps);
+      printf0("| val_loss_every        | %-50d |\n", val_loss_every);
+      printf0("| val_max_steps         | %-50d |\n", val_max_steps);
+      printf0("| sample_every          | %-50d |\n", sample_every);
+      printf0("| genT                  | %-50d |\n", genT);
+      printf0("| overfit_single_batch  | %-50d |\n", overfit_single_batch);
+      printf0("| use_master_weights    | %-50s |\n", use_master_weights ? "enabled" : "disabled");
+      printf0("| gelu_fusion           | %-50d |\n", gelu_fusion);
+      printf0("| recompute             | %-50d |\n", recompute);
+      printf0("+-----------------------+----------------------------------------------------+\n");
+      const char *precision_str = (PRECISION_MODE == PRECISION_FP32)
+                                      ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? "TF32" : "FP32")
+                                      : (PRECISION_MODE == PRECISION_FP16 ? "FP16" : "BF16");
+      printf0("| device                | %-50s |\n", deviceProp.name);
+      printf0("| peak TFlops           | %-50.1f |\n", get_flops_promised(deviceProp.name, PRECISION_MODE));
+      printf0("| precision             | %-50s |\n", precision_str);
+      printf0("+-----------------------+----------------------------------------------------+\n");
+
+      int resuming = 0;
+
+      int resume_max_step = find_max_step(output_log_dir);
+      if (resume == 1)
+      {
+            assert(output_log_dir != NULL);
+            if (resume_max_step != -1)
+            {
+                  resuming = 1;
+                  snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, resume_max_step);
+            }
+      }
+
+      GPT2 model;
+      gpt2_init_common(&model);
+      if (resuming == 1)
+      {
+
+            bool weight_init = !use_master_weights;
+            gpt2_build_from_checkpoint(&model, filename_buffer, weight_init);
+      }
+      else if (ends_with_bin(load_filename))
+      {
+
+            gpt2_build_from_checkpoint(&model, load_filename);
+      }
+      else
+      {
+
+            gpt_build_from_descriptor(&model, load_filename);
+      }
+
+      model.use_master_weights = use_master_weights;
+      model.gelu_fusion = gelu_fusion;
+      model.recompute = recompute;
+      printf0("| weight init method    | %-50s |\n", resuming == 1 ? "intermediate checkpoint" : load_filename);
+      printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len);
+      printf0("| vocab_size V          | %-50d |\n", model.config.vocab_size);
+      printf0("| padded_vocab_size Vp  | %-50d |\n", model.config.padded_vocab_size);
+      printf0("| num_layers L          | %-50d |\n", model.config.num_layers);
+      printf0("| num_heads NH          | %-50d |\n", model.config.num_heads);
+      printf0("| channels C            | %-50d |\n", model.config.channels);
+      printf0("| num_parameters        | %-50zu |\n", model.num_parameters);
+      printf0("+-----------------------+----------------------------------------------------+\n");
+
+      int permute_train_loader = (overfit_single_batch == 1) ? 0 : 1;
+      DataLoader train_loader, val_loader;
+      dataloader_init(&train_loader, train_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, permute_train_loader);
+      dataloader_init(&val_loader, val_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, 0);
+
+      int train_num_batches = max_steps;
+      if (train_num_batches == -1)
+      {
+
+            size_t ntok = train_loader.num_tokens;
+
+            train_num_batches = ntok / total_batch_size;
+      }
+
+      int val_num_batches = val_max_steps;
+      if (val_num_batches == -1)
+      {
+
+            size_t ntok = val_loader.num_tokens;
+
+            val_num_batches = ntok / tokens_per_fwdbwd;
+      }
+      printf0("| train_num_batches     | %-50d |\n", train_num_batches);
+      printf0("| val_num_batches       | %-50d |\n", val_num_batches);
+      printf0("+-----------------------+----------------------------------------------------+\n");
+
+      EvalLoader eval_loader;
+      const char *hellaswag_path = "dev/data/hellaswag/hellaswag_val.bin";
+      const bool hellaswag_available = access(hellaswag_path, F_OK) == 0;
+      const bool run_hellaswag = hellaswag_eval && hellaswag_available;
+      if (run_hellaswag)
+      {
+            evalloader_init(&eval_loader, hellaswag_path, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes);
+      }
+      printf0("| run hellaswag         | %-50s |\n", run_hellaswag ? "yes" : "no");
+      printf0("+-----------------------+----------------------------------------------------+\n");
+
+      set_zero_configs(&multi_gpu_config, zero_stage, model.num_parameters);
+      printf0("| num_processes         | %-50d |\n", multi_gpu_config.num_processes);
+      printf0("| zero_stage            | %-50d |\n", multi_gpu_config.zero_stage);
+      printf0("+-----------------------+----------------------------------------------------+\n");
+
+      if (!hellaswag_available)
+      {
+            printf0("HellaSwag eval not found at %s, skipping its evaluation\n", hellaswag_path);
+            printf0("You can run `python dev/data/hellaswag.py` to export and use it with `-h 1`.\n");
+      }
+
+      printf0("num_parameters: %zu => bytes: %zu\n", model.num_parameters, model.num_parameters_bytes);
+      printf0("allocated %d MiB for model parameters\n", (int)round(model.num_parameters_bytes / (1024 * 1024)));
+
+      printf0("batch_size B=%d * seq_len T=%d * num_processes=%d and total_batch_size=%d\n",
+              B, T, multi_gpu_config.num_processes, total_batch_size);
+      printf0("=> setting grad_accum_steps=%d\n", grad_accum_steps);
+
+      if (multi_gpu_config.process_rank == 0)
+      {
+            create_dir_if_not_exists(output_log_dir);
+      }
+      Logger logger;
+      logger_init(&logger, output_log_dir, multi_gpu_config.process_rank, resume);
+
+      Tokenizer tokenizer;
+      tokenizer_init(&tokenizer, "gpt2_tokenizer.bin");
+
+      LearningRateScheduler lr_scheduler;
+      lr_scheduler_init(&lr_scheduler, lr_scheduler_type, learning_rate,
+                        warmup_iterations, train_num_batches, final_learning_rate_frac);
+
+      int *gen_tokens = (int *)mallocCheck(B * T * sizeof(int));
+      floatX *cpu_logits_raw = (floatX *)mallocCheck(model.config.vocab_size * sizeof(floatX));
+      float *cpu_logits = (float *)mallocCheck(model.config.vocab_size * sizeof(float));
+
+      int step = 0;
+      gpt2_allocate_state(&model, B, T);
+      if (resuming == 1)
+      {
+            snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, resume_max_step, multi_gpu_config.process_rank);
+            load_state(&step, &model, &train_loader, filename_buffer);
+      }
+
+      OutlierDetector loss_outlier_detector, grad_norm_outlier_detector;
+      init_detector(&loss_outlier_detector);
+      init_detector(&grad_norm_outlier_detector);
+
+      if (T < model.config.max_seq_len)
+      {
+            printf0("!!!!!!!!\n");
+            printf0("WARNING:\n");
+            printf0("- The training sequence length is: T=%d (set with -t)\n", T);
+            printf0("- The model's max sequence length is: max_seq_len=%d\n", model.config.max_seq_len);
+            printf0("You are attempting to train with a sequence length shorter than the model's max.\n");
+            printf0("This will lead to unused parameters in the wpe position embedding weights.\n");
+            printf0("If you know what you're doing you can ignore this warning.\n");
+            printf0("If you're like ???, you are most likely misconfiguring your training run.\n");
+            printf0("---> HINT: If you're training GPT-2 use -t 1024. If GPT-3, use -t 2048.\n");
+            printf0("!!!!!!!!\n");
+      }
+
+      assert(T <= model.config.max_seq_len);
+
+      cudaEvent_t start, end;
+      cudaCheck(cudaEventCreate(&start));
+      cudaCheck(cudaEventCreate(&end));
+      cudaCheck(cudaProfilerStart());
+      double total_sum_iteration_time_s = 0.0;
+      float ema_tokens_per_second = 0.0f;
+      for (; step <= train_num_batches; step++)
+      {
+            NvtxRange step_range("Train step", step);
+
+            int last_step = step == train_num_batches;
+
+            if (step % val_loss_every == 0 || last_step)
+            {
+                  NvtxRange validation_range("validation");
+                  float val_loss = 0.0f;
+                  dataloader_reset(&val_loader);
+                  for (int i = 0; i < val_num_batches; i++)
+                  {
+                        dataloader_next_batch(&val_loader);
+                        val_loss += gpt2_validate(&model, val_loader.inputs, val_loader.targets, B, T);
+                  }
+                  val_loss /= val_num_batches;
+                  val_loss = multi_gpu_cpu_float_sum(val_loss, &multi_gpu_config) / multi_gpu_config.num_processes;
+                  printf0("val loss %f\n", val_loss);
+                  logger_log_val(&logger, step, val_loss);
+            }
+
+            if (run_hellaswag &&
+                ((step > 0 && step % val_loss_every == 0) || last_step))
+            {
+                  NvtxRange evaluation_range("evaluation");
+                  float eval_acc_norm = 0.0f;
+                  evalloader_reset(&eval_loader);
+                  for (int i = 0; i < eval_loader.num_batches; i++)
+                  {
+                        if (i % 10 == 0)
+                        {
+                              printf("evaluating HellaSwag: %d/%d\r", i, eval_loader.num_batches);
+                        }
+                        evalloader_next_batch(&eval_loader);
+                        gpt2_validate(&model, eval_loader.inputs, eval_loader.targets, B, T);
+                        int correct = evalloader_stat_losses(&eval_loader, model.cpu_losses);
+                        eval_acc_norm += (float)correct;
+                  }
+
+                  eval_acc_norm = multi_gpu_cpu_float_sum(eval_acc_norm, &multi_gpu_config);
+                  printf0("HellaSwag: %d/%d = %f\n", (int)eval_acc_norm, eval_loader.num_examples, eval_acc_norm / eval_loader.num_examples);
+                  logger_log_eval(&logger, step, eval_acc_norm / eval_loader.num_examples);
+            }
+
+            if (multi_gpu_config.process_rank == 0 && sample_every > 0 &&
+                (step > 0 && (step % sample_every) == 0 || last_step))
+            {
+                  NvtxRange generation_range("generation");
+                  unsigned long long sample_rng_state = 1337;
+
+                  int eot_token = tokenizer.eot_token;
+                  for (int i = 0; i < B * T; ++i)
+                  {
+                        gen_tokens[i] = eot_token;
+                  }
+
+                  printf("generating:\n---\n");
+                  for (int t = 1; t < genT; t++)
+                  {
+                        NvtxRange generation_range("Generation step", t);
+
+                        gpt2_forward(&model, gen_tokens, 1, CEIL_DIV(t, min(T, 256)) * min(T, 256));
+
+                        floatX *logits = model.acts.output + (t - 1) * model.config.padded_vocab_size;
+
+                        cudaCheck(cudaMemcpy(cpu_logits_raw, logits, model.config.vocab_size * sizeof(floatX), cudaMemcpyDeviceToHost));
+
+                        for (int i = 0; i < model.config.vocab_size; i++)
+                        {
+                              cpu_logits[i] = (float)cpu_logits_raw[i];
+                        }
+
+                        float coin = random_f32(&sample_rng_state);
+                        int next_token = sample_softmax(cpu_logits, model.config.vocab_size, coin);
+                        gen_tokens[t] = next_token;
+
+                        if (tokenizer.init_ok)
+                        {
+                              const char *token_str = tokenizer_decode(&tokenizer, next_token);
+                              safe_printf(token_str);
+                        }
+                        else
+                        {
+
+                              printf("%d ", next_token);
+                        }
+                        fflush(stdout);
+                  }
+                  printf("\n---\n");
+            }
+
+            if ((checkpoint_every > 0 && output_log_dir != NULL && resuming == 0) &&
+                ((step > 0 && step % checkpoint_every == 0) || last_step))
+            {
+
+                  write_checkpoint(output_log_dir, step, &model, &train_loader, &multi_gpu_config);
+
+                  int step_delete = step - checkpoints_keep * checkpoint_every;
+                  if (checkpoints_keep > 0 && step_delete > 0 &&
+                      (major_checkpoint_every == 0 || step_delete % major_checkpoint_every != 0))
+                  {
+                        delete_checkpoint(output_log_dir, step_delete, &multi_gpu_config);
+                  }
+            }
+            resuming = 0;
+
+            if (last_step)
+            {
+                  break;
+            }
+
+            if (overfit_single_batch == 1)
+            {
+
+                  dataloader_reset(&train_loader);
+            }
+
+            cudaCheck(cudaEventRecord(start));
+
+            for (int micro_step = 0; micro_step < grad_accum_steps; micro_step++)
+            {
+
+                  dataloader_next_batch(&train_loader);
+
+                  gpt2_forward(&model, train_loader.inputs, B, T);
+
+                  gpt2_backward_and_reduce(&model, train_loader.inputs, train_loader.targets, grad_accum_steps, micro_step);
+            }
+            float zloss = (float)(update_detector(&loss_outlier_detector, (double)model.mean_loss));
+
+            float step_learning_rate = get_learning_rate(&lr_scheduler, step);
+
+            float grad_norm = gpt2_calculate_grad_norm(&model, &multi_gpu_config);
+            float zgrad = (float)(update_detector(&grad_norm_outlier_detector, (double)grad_norm));
+
+            if (isfinite(zloss) && skip_update_lossz != 0.0f && zloss > skip_update_lossz)
+            {
+                  printf0("skipping update due to loss z-score of %f\n", zloss);
+            }
+            else if (isfinite(zgrad) && skip_update_gradz != 0.0f && zgrad > skip_update_gradz)
+            {
+                  printf0("skipping update due to grad z-score of %f\n", zgrad);
+            }
+            else
+            {
+
+                  float grad_clip = 1.0f;
+                  float grad_scale = (grad_norm > grad_clip) ? grad_clip / grad_norm : 1.0f;
+                  gpt2_update(&model, step_learning_rate, 0.9f, 0.95f, 1e-8f, weight_decay, grad_scale, step + 1, &multi_gpu_config);
+            }
+            cudaCheck(cudaEventRecord(end));
+            cudaCheck(cudaEventSynchronize(end));
+
+            float time_elapsed_ms;
+            cudaCheck(cudaEventElapsedTime(&time_elapsed_ms, start, end));
+            size_t tokens_processed = (size_t)multi_gpu_config.num_processes * B * T * grad_accum_steps;
+            float tokens_per_second = tokens_processed / time_elapsed_ms * 1000.0f;
+            float bias_corrected_ema_tokens_per_second = tokens_per_second;
+            if (step > 0)
+            {
+                  total_sum_iteration_time_s += time_elapsed_ms / 1000.0f;
+
+                  ema_tokens_per_second = 0.95f * ema_tokens_per_second + 0.05f * tokens_per_second;
+                  bias_corrected_ema_tokens_per_second = ema_tokens_per_second / (1.0f - powf(0.95f, step));
+            }
+            float mfu = gpt2_estimate_mfu(&model, B * T * grad_accum_steps, time_elapsed_ms / 1000.0f);
+            printf0("step %4d/%d | loss %7.6f (%+.2fz)| norm %6.4f (%+.2fz)| lr %.2e | %.2f ms | %.1f%% bf16 MFU | %.0f tok/s\n",
+                    step + 1, train_num_batches, model.mean_loss, zloss, grad_norm, zgrad, step_learning_rate,
+                    time_elapsed_ms, 100 * mfu, bias_corrected_ema_tokens_per_second);
+            if (log_gpu_every > 0 && (step + 1) % log_gpu_every == 0)
+            {
+                  GPUUtilInfo gpu_info = get_gpu_utilization_info();
+                  printf0("                  compute %2.1f%% | memory: %2.1f%% | fan: %2d%% | %4d MHz / %4d MHz | %3d W / %3d W | %d°C / %d°C | %s\n",
+                          gpu_info.gpu_utilization, gpu_info.mem_utilization, gpu_info.fan, gpu_info.clock, gpu_info.max_clock, gpu_info.power / 1000, gpu_info.power_limit / 1000,
+                          gpu_info.temperature, gpu_info.temp_slowdown, gpu_info.throttle_reason);
+            }
+            logger_log_train(&logger, step, model.mean_loss, step_learning_rate, grad_norm);
+
+            if (step == 3)
+            {
+                  cudaProfilerStop();
+            }
+      }
+
+      printf0("total average iteration time: %f ms\n", total_sum_iteration_time_s / (train_num_batches - 1) * 1000);
+
+      cudaCheck(cudaEventDestroy(end));
+      cudaCheck(cudaEventDestroy(start));
+      if (run_hellaswag)
+      {
+            evalloader_free(&eval_loader);
+      }
+      dataloader_free(&train_loader);
+      dataloader_free(&val_loader);
+      tokenizer_free(&tokenizer);
+      free(cpu_logits_raw);
+      free(cpu_logits);
+      free(gen_tokens);
+      multi_gpu_config_free(&multi_gpu_config);
+      gpt2_free(&model);
+      common_free(model);
+      return 0;
+}
+#endif
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 804d8ed..c3222c6 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2026 Eamon
+Copyright (c) 2026 Eamon
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ccb6702
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,104 @@
+# =============================================================================
+# Quadtrix.cpp — Makefile  (llama.cpp-style convenience targets)
+# =============================================================================
+
+.PHONY: all release debug benchmark-bin clean-native build run dev gpu train-cpp train-torch bench logs ps shell clean format lint-py help
+
+SHELL := /bin/bash
+SCRIPT := ./scripts/build.sh
+
+# ── Native C++ ───────────────────────────────────────────────────────────────
+CC     := g++
+CFLAGS := -std=c++17 -O3 -march=native
+IFLAGS := -I. -Iinclude
+TARGET := quadtrix
+SRCS   := main.cpp
+
+all: $(TARGET)
+
+$(TARGET): $(SRCS)
+	$(CC) $(CFLAGS) $(IFLAGS) -o $@ $^
+	@echo "✓ Built $(TARGET)"
+
+# Optimised release: same compile flags plus -DNDEBUG, then strips symbols from $(TARGET)
+release: $(SRCS)
+	$(CC) $(CFLAGS) $(IFLAGS) -DNDEBUG -o $(TARGET) $^
+	strip $(TARGET)
+
+# Debug build: -O0 with debug info and ASan/UBSan instrumentation, written to $(TARGET)-debug
+debug: $(SRCS)
+	$(CC) -std=c++17 -O0 -g -fsanitize=address,undefined \
+	      $(IFLAGS) -o $(TARGET)-debug $^
+
+benchmark-bin: benchmark.cpp
+	$(CC) $(CFLAGS) $(IFLAGS) -o quadtrix-bench $^
+
+clean-native:
+	rm -f $(TARGET) $(TARGET)-debug quadtrix-bench
+
+# ── Docker / Compose targets ─────────────────────────────────────────────────
+build:
+	$(SCRIPT) up
+
+run: build
+	@echo "Stack already started."
+
+dev:
+	$(SCRIPT) dev
+
+gpu:
+	$(SCRIPT) gpu
+
+train-cpp:
+	$(SCRIPT) train-cpp
+
+train-torch:
+	$(SCRIPT) train-torch
+
+bench:
+	$(SCRIPT) bench
+
+logs:
+	$(SCRIPT) logs
+
+ps:
+	$(SCRIPT) ps
+
+shell:
+	$(SCRIPT) shell $(SERVICE)
+
+clean:
+	$(SCRIPT) clean
+
+# ── Misc ─────────────────────────────────────────────────────────────────────
+# Uses find -exec so paths containing whitespace work and clang-format is skipped when nothing matches
+format:
+	find . \( -name "*.cpp" -o -name "*.h" \) ! -path "./build/*" \
+	  -exec clang-format -i --style=LLVM {} +
+
+lint-py:
+	ruff check backend/ engine/
+
+help:
+	@echo ""
+	@echo "  Quadtrix.cpp — make targets"
+	@echo ""
+	@echo "  Native:"
+	@echo "    make              Build C++ binary (native)"
+	@echo "    make release      Stripped release binary"
+	@echo "    make debug        Debug binary with ASan/UBSan"
+	@echo "    make clean-native Remove native build artifacts"
+	@echo "    make format       Run clang-format on all C++ files"
+	@echo ""
+	@echo "  Docker:"
+	@echo "    make build        docker compose up --build (CPU)"
+	@echo "    make dev          Hot-reload dev stack"
+	@echo "    make gpu          CUDA GPU stack"
+	@echo "    make train-cpp    Train with C++ inside Docker"
+	@echo "    make train-torch  Train with PyTorch inside Docker"
+	@echo "    make bench        Run benchmark"
+	@echo "    make logs         Tail all logs"
+	@echo "    make ps           Show container status"
+	@echo "    make shell        Shell into backend (SERVICE=frontend to change)"
+	@echo "    make clean        Remove containers + volumes"
+	@echo ""
diff --git a/README.md b/README.md
index 0feeebe..56f99cc 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
 # Quadtrix.cpp
 
+<p align="center">
+  <img width="785" height="261" alt="image" src="https://github.com/user-attachments/assets/7bd2d8c6-d1e3-4ca0-96c0-0161d3cf235a" />
+</p>
+
 A local large language model with a modular, multi-path execution architecture. Train, run inference, and serve a chat interface — all from a single repository, across bare-metal C++, PyTorch, and a React frontend.
 
 > Full technical reference: [docs](https://eamon2009.github.io/LLMs/)
diff --git a/config/config.h b/config/config.h
index db053cb..844efeb 100644
--- a/config/config.h
+++ b/config/config.h
@@ -1,34 +1,18 @@
 #pragma once
-// ============================================================
-//  config/config.h  –  Global constants (mirrors config/config.py)
-// ============================================================
-
 #include <string>
-
-// ── Paths ────────────────────────────────────────────────────
-// Set CLEANED_PATH to your input text file before compiling,
-// or override at runtime via the env-var GPT_DATA_PATH.
 static const std::string DEFAULT_CLEANED_PATH = "data/input.txt";
 static const std::string DATA_PATH_ENV_VAR = "GPT_DATA_PATH";
-
-// ── Reproducibility ──────────────────────────────────────────
 static const unsigned int SEED = 1337;
-
-// ── Data split ───────────────────────────────────────────────
 static const double TRAIN_SPLIT = 0.9; // 90 % train, 10 % val
-
-// ── Hyper-parameters (identical to the Python script) ───────
 static const int BATCH_SIZE = 4;
 static const int BLOCK_SIZE = 64; // context length
-static const int MAX_ITERS = 3000;
+static const int MAX_ITERS = 10000;
 static const int EVAL_INTERVAL = 20;
 static const float LEARNING_RATE = 3e-4f;
-static const int EVAL_ITERS = 10;
+static const int EVAL_ITERS = 1;
 static const int N_EMBD = 128;
 static const int N_HEAD = 4;
 static const int N_LAYER = 4;
 static const float DROPOUT = 0.2f; // applied during training only
-
-// ── Output paths ─────────────────────────────────────────────
 static const std::string BEST_MODEL_PATH = "best_model.bin";
 static const std::string MODEL_PATH_ENV_VAR = "GPT_MODEL_PATH";
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000..a2e9a85
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,45 @@
+services:
+  frontend:
+    build:
+      context: .
+      dockerfile: .devops/Dockerfile.dev.frontend
+    ports:
+      - "5173:5173"
+    volumes:
+      - ./frontend:/app:delegated
+      - /app/node_modules
+    environment:
+      VITE_API_BASE_URL: "http://localhost:3001"
+    command: [ "npm", "run", "dev", "--", "--host", "0.0.0.0" ]
+    healthcheck:
+      test: [ "CMD", "wget", "-qO-", "http://localhost:5173/" ]
+      interval: 15s
+      timeout: 5s
+      retries: 5
+
+  backend:
+    volumes:
+      - ./backend:/app/backend:delegated
+      - ./engine:/app/engine:delegated
+      - models:/models
+    environment:
+      LOG_LEVEL: DEBUG
+      CORS_ORIGINS: "http://localhost:5173,http://localhost:3001"
+    command:
+      - python
+      - -m
+      - uvicorn
+      - main:app
+      - --host
+      - "0.0.0.0"
+      - --port
+      - "3001"
+      - --reload
+      - --reload-dir
+      - /app/backend
+
+  redis:
+    ports:
+      - "6379:6379"
+volumes:
+  models:
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
new file mode 100644
index 0000000..abbd02e
--- /dev/null
+++ b/docker-compose.gpu.yml
@@ -0,0 +1,32 @@
+services:
+  backend:
+    build:
+      args:
+        CUDA: "1"
+    image: quadtrix/backend-cuda:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [ gpu ]
+    environment:
+      CUDA_VISIBLE_DEVICES: "0"
+      TORCH_CHECKPOINT_PATH: /models/best_model.pt
+
+  train-torch:
+    build:
+      args:
+        CUDA: "1"
+    image: quadtrix/backend-cuda:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [ gpu ]
+    environment:
+      CUDA_VISIBLE_DEVICES: "0"
+      QUADTRIX_TRAIN_DATA: /app/data/input.txt
diff --git a/docker-compose.yml b/docker-compose.yml
index 8191856..7bb3572 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,34 +1,173 @@
+name: quadtrix
+
+x-common-env: &common-env
+  TZ: UTC
+  PYTHONUNBUFFERED: "1"
+
 services:
-  quadtrix:
-    image: ghcr.io/eamon2009/quadtrix.cpp:latest
+
+  frontend:
     build:
       context: .
-      dockerfile: Dockerfile
+      dockerfile: .devops/Dockerfile.frontend
       args:
-        # for cuda
-        # BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu24.04
-        BASE_IMAGE: ubuntu:24.04
-
+        VITE_API_BASE_URL: ""
+    image: quadtrix/frontend:latest
+    container_name: quadtrix-frontend
+    restart: unless-stopped
     ports:
-      - "3001:3001" # FastAPI backend
-      - "8080:8080" # React frontend
-
-    volumes:
-      # Place best_model.pt and/or best_model.bin inside ./models/
-      - ./models:/app/models
+      - "5173:80"
+    depends_on:
+      backend:
+        condition: service_healthy
+    networks:
+      - quadtrix-net
+    healthcheck:
+      test: [ "CMD", "wget", "-qO-", "http://localhost/" ]
+      interval: 30s
+      timeout: 5s
+      retries: 3
 
+  backend:
+    build:
+      context: .
+      dockerfile: .devops/Dockerfile.backend
+    image: quadtrix/backend:latest
+    container_name: quadtrix-backend
+    restart: unless-stopped
+    ports:
+      - "3001:3001"
     environment:
-      TORCH_CHECKPOINT_PATH: /app/models/best_model.pt
-      GPT_MODEL_PATH: /app/models/best_model.bin
-      CORS_ORIGINS: http://localhost:8080
+      <<: *common-env
+      API_PORT: "3001"
+      CORS_ORIGINS: "http://localhost:5173,http://frontend"
+      REDIS_URL: "redis://redis:6379/0"
+      TORCH_CHECKPOINT_PATH: /models/best_model.pt
       LOG_LEVEL: INFO
-      MAX_SESSIONS: 1000
-      SESSION_TTL_HOURS: 24
-    restart: unless-stopped
-
+      MAX_SESSIONS: "500"
+      SESSION_TTL_HOURS: "24"
+    volumes:
+      - models:/models
+      - ./engine:/app/engine:ro
+    depends_on:
+      redis:
+        condition: service_healthy
+    networks:
+      - quadtrix-net
     healthcheck:
       test: [ "CMD", "curl", "-f", "http://localhost:3001/api/health" ]
       interval: 30s
       timeout: 10s
-      retries: 5
       start_period: 20s
+      retries: 3
+
+  redis:
+    image: redis:7-alpine
+    container_name: quadtrix-redis
+    restart: unless-stopped
+    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
+    volumes:
+      - redis-data:/data
+    networks:
+      - quadtrix-net
+    healthcheck:
+      test: [ "CMD", "redis-cli", "ping" ]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    expose:
+      - "6379"
+
+  cpp:
+    build:
+      context: .
+      dockerfile: .devops/Dockerfile.cpp
+    image: quadtrix/cpp:latest
+    container_name: quadtrix-cpp
+
+    restart: "no"
+    stdin_open: true
+    tty: true
+    volumes:
+      - models:/models
+      - ./data:/app/data:ro
+    environment:
+      <<: *common-env
+      GPT_DATA_PATH: /app/data/input.txt
+      GPT_MODEL_PATH: /models/best_model.bin
+    networks:
+      - quadtrix-net
+    profiles:
+      - cpp
+
+  train-cpp:
+    build:
+      context: .
+      dockerfile: .devops/Dockerfile.cpp
+    image: quadtrix/cpp:latest
+    container_name: quadtrix-train-cpp
+    restart: "no"
+    volumes:
+      - models:/models
+      - ./data:/app/data:ro
+    environment:
+      <<: *common-env
+      GPT_DATA_PATH: /app/data/input.txt
+      GPT_MODEL_PATH: /models/best_model.bin
+    command: [ "data/input.txt" ] # train mode (no --chat flag)
+    networks:
+      - quadtrix-net
+    profiles:
+      - train
+
+  train-torch:
+    build:
+      context: .
+      dockerfile: .devops/Dockerfile.backend
+    image: quadtrix/backend:latest
+    container_name: quadtrix-train-torch
+    restart: "no"
+    volumes:
+      - models:/models
+      - ./engine:/app/engine
+      - ./data:/app/data:ro
+    environment:
+      <<: *common-env
+      QUADTRIX_TRAIN_DATA: /app/data/input.txt
+    working_dir: /app
+    command: [ "python", "engine/main.py" ]
+    networks:
+      - quadtrix-net
+    profiles:
+      - train
+
+  benchmark:
+    build:
+      context: .
+      dockerfile: .devops/Dockerfile.cpp
+    image: quadtrix/cpp:latest
+    container_name: quadtrix-benchmark
+    restart: "no"
+    volumes:
+      - models:/models
+      - ./data:/app/data:ro
+      - ./benchmark_results.csv:/app/benchmark_results.csv
+    environment:
+      <<: *common-env
+      GPT_MODEL_PATH: /models/best_model.bin
+
+    command: [ "data/input.txt", "--generate" ]
+    networks:
+      - quadtrix-net
+    profiles:
+      - benchmark
+
+volumes:
+  models:
+    driver: local
+  redis-data:
+    driver: local
+
+networks:
+  quadtrix-net:
+    driver: bridge
diff --git a/frontend/src/components/chat/EmptyState.tsx b/frontend/src/components/chat/EmptyState.tsx
index ce75d9a..abf94ec 100644
--- a/frontend/src/components/chat/EmptyState.tsx
+++ b/frontend/src/components/chat/EmptyState.tsx
@@ -1,13 +1,95 @@
 export function EmptyState() {
   return (
-    <div className="flex flex-1 items-center justify-center px-6">
-      <div className="flex w-full max-w-3xl flex-col items-center gap-6 text-center">
-        <div className="flex h-16 w-16 items-center justify-center rounded-md border border-[var(--border-muted)] bg-white">
-          <img alt="Quadtrix.cpp icon" className="h-14 w-14 object-contain" src="/icon.svg" />
+    <div
+      style={{
+        flex: 1, // take the remaining space when rendered inside a flex container
+        display: "flex",
+        alignItems: "center",
+        justifyContent: "center",
+        padding: "24px",
+      }}
+    >
+      <div
+        style={{
+          display: "flex",
+          flexDirection: "column",
+          alignItems: "center",
+          gap: 20,
+          textAlign: "center",
+          maxWidth: 420, // cap the width of the centered column
+        }}
+      >
+        {/* Icon */}
+        <div
+          style={{
+            width: 56,
+            height: 56,
+            borderRadius: 14,
+            background: "linear-gradient(135deg, #4f8ef7 0%, #2563eb 100%)",
+            display: "flex",
+            alignItems: "center",
+            justifyContent: "center",
+            boxShadow: "0 8px 32px rgba(79,142,247,0.25)", // glow in the first gradient colour (#4f8ef7 = rgb 79,142,247)
+          }}
+        >
+          <svg width="28" height="28" viewBox="0 0 24 24" fill="none">
+            <path
+              d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm-1 14H9V8h2v8zm4 0h-2V8h2v8z"
+              fill="white"
+              opacity="0.9"
+            />
+            <path
+              d="M8 12l2-2 2 2 4-4"
+              stroke="white"
+              strokeWidth="1.5"
+              strokeLinecap="round"
+              strokeLinejoin="round"
+              fill="none"
+            />
+          </svg>
         </div>
-        <div className="space-y-2">
-          <h1 className="font-mono text-2xl font-semibold tracking-[0.18em] text-[var(--text-primary)]">Quadtrix.cpp</h1>
-          <p className="text-sm text-[var(--text-secondary)]">Minimal local chat interface. Start typing below to begin.</p>
+
+        <div>
+          <h1
+            style={{
+              margin: 0, // drop the default heading margins
+              fontSize: 20,
+              fontWeight: 600,
+              color: "var(--text-primary)",
+              letterSpacing: "-0.3px",
+            }}
+          >
+            Quadtrix.cpp
+          </h1>
+          <p
+            style={{
+              margin: "8px 0 0", // 8px above the tagline, none on the other sides
+              fontSize: 13,
+              color: "var(--text-muted)",
+              lineHeight: 1.6,
+            }}
+          >
+            Local char-level language model. Start a conversation below.
+          </p>
+        </div>
+
+        {/* Hint chips */}
+        <div style={{ display: "flex", flexWrap: "wrap", gap: 8, justifyContent: "center" }}>
+          {["Fast local inference", "C++ & PyTorch backends", "No cloud required"].map((chip) => (
+            <span
+              key={chip}
+              style={{
+                padding: "4px 10px",
+                borderRadius: 20, // large radius gives the chip a pill shape
+                border: "1px solid var(--border-muted)",
+                fontSize: 11,
+                color: "var(--text-muted)",
+                background: "var(--bg-elevated)",
+              }}
+            >
+              {chip}
+            </span>
+          ))}
         </div>
       </div>
     </div>
diff --git a/frontend/src/components/chat/MessageAvatar.tsx b/frontend/src/components/chat/MessageAvatar.tsx
index 25373d5..c606c9d 100644
--- a/frontend/src/components/chat/MessageAvatar.tsx
+++ b/frontend/src/components/chat/MessageAvatar.tsx
@@ -6,15 +6,48 @@ interface MessageAvatarProps {
 
 export function MessageAvatar({ role }: MessageAvatarProps) {
   const isUser = role === "user";
+
+  if (isUser) { // user avatar: neutral circle showing the letter "U"
+    return (
+      <div
+        style={{
+          width: 30,
+          height: 30,
+          borderRadius: "50%",
+          background: "var(--bg-elevated)",
+          border: "1px solid var(--border-muted)",
+          display: "flex",
+          alignItems: "center",
+          justifyContent: "center",
+          fontSize: 11,
+          fontWeight: 600,
+          color: "var(--text-secondary)",
+          flexShrink: 0, // keep the fixed 30px size when the surrounding flex row is tight
+        }}
+      >
+        U
+      </div>
+    );
+  }
+
   return (
     <div
-      className={`flex h-8 w-8 shrink-0 items-center justify-center rounded-md border text-xs font-semibold ${
-        isUser
-          ? "border-[var(--border-muted)] bg-elevated text-[var(--text-primary)]"
-          : "border-[var(--border-muted)] bg-surface font-mono text-[var(--text-primary)]"
-      }`}
+      style={{
+        width: 30,
+        height: 30,
+        borderRadius: "50%",
+        background: "linear-gradient(135deg, #4f8ef7 0%, #2563eb 100%)", // every non-user role gets the blue gradient circle
+        display: "flex",
+        alignItems: "center",
+        justifyContent: "center",
+        fontSize: 12,
+        fontWeight: 700,
+        color: "#fff",
+        flexShrink: 0, // keep the fixed 30px size when the surrounding flex row is tight
+        boxShadow: "0 2px 8px rgba(79,142,247,0.3)",
+      }}
     >
-      {isUser ? "You" : "Q"}
+      Q
     </div>
   );
 }
diff --git a/frontend/src/components/chat/MessageList.tsx b/frontend/src/components/chat/MessageList.tsx
index e38a0af..5de6e62 100644
--- a/frontend/src/components/chat/MessageList.tsx
+++ b/frontend/src/components/chat/MessageList.tsx
@@ -1,5 +1,6 @@
-import { useAutoScroll } from "../../hooks/useAutoScroll";
+import { useRef } from "react";
 import type { Message } from "../../types";
+import { useAutoScroll } from "../../hooks/useAutoScroll";
 import { MessageRow } from "./MessageRow";
 
 interface MessageListProps {
@@ -7,13 +8,25 @@ interface MessageListProps {
 }
 
 export function MessageList({ messages }: MessageListProps) {
-  const scrollRef = useAutoScroll<HTMLDivElement>(messages.length);
+  const bottomRef = useRef<HTMLDivElement | null>(null); // attached to the empty div rendered after the last message
+  useAutoScroll(bottomRef, messages); // hook lives in ../../hooks/useAutoScroll; presumably scrolls bottomRef into view when messages change - confirm against the hook
+
   return (
-    <div className="flex-1 overflow-y-auto px-4 py-8 md:px-8 md:py-10" ref={scrollRef}>
-      <div className="mx-auto flex max-w-4xl flex-col gap-8">
+    <div
+      style={{
+        flex: 1,
+        overflowY: "auto", // this outer div is the vertically scrollable region
+        padding: "24px 16px",
+        display: "flex",
+        flexDirection: "column",
+        gap: 20,
+      }}
+    >
+      <div style={{ maxWidth: 780, width: "100%", margin: "0 auto", display: "flex", flexDirection: "column", gap: 20 }}>
         {messages.map((message) => (
           <MessageRow key={message.id} message={message} />
         ))}
+        <div ref={bottomRef} />
       </div>
     </div>
   );
diff --git a/frontend/src/components/chat/MessageRow.tsx b/frontend/src/components/chat/MessageRow.tsx
index 372d585..8dd3910 100644
--- a/frontend/src/components/chat/MessageRow.tsx
+++ b/frontend/src/components/chat/MessageRow.tsx
@@ -27,37 +27,96 @@ export function MessageRow({ message }: MessageRowProps) {
   return (
     <motion.div
       animate={{ opacity: 1, y: 0 }}
-      className={`group flex w-full gap-3 ${isUser ? "justify-end" : "justify-start"}`}
       initial={{ opacity: 0, y: 6 }}
-      transition={{ duration: 0.2 }}
+      transition={{ duration: 0.18 }}
+      className="group"
+      style={{
+        display: "flex",
+        width: "100%",
+        gap: 12,
+        justifyContent: isUser ? "flex-end" : "flex-start", // user rows sit on the right, all others on the left
+        alignItems: "flex-start",
+      }}
     >
       {!isUser && <MessageAvatar role={message.role} />}
-      <div className={`max-w-[min(760px,calc(100vw-48px))] ${isUser ? "items-end" : "items-start"} flex flex-col gap-1`}>
-        <div className="flex items-center gap-2 font-mono text-[11px] uppercase tracking-[0.16em] text-[var(--text-muted)]">
-          <span>{isUser ? "You" : "Quadtrix"}</span>
+
+      <div
+        style={{
+          maxWidth: "min(680px, calc(100vw - 80px))", // never wider than 680px or the viewport minus 80px
+          display: "flex",
+          flexDirection: "column",
+          gap: 4,
+          alignItems: isUser ? "flex-end" : "flex-start",
+        }}
+      >
+        {/* Meta row */}
+        <div
+          style={{
+            display: "flex",
+            alignItems: "center",
+            gap: 8,
+            fontSize: 11,
+            color: "var(--text-muted)",
+          }}
+        >
+          <span style={{ fontWeight: 500 }}>{isUser ? "You" : "Quadtrix"}</span>
           <span>{formatRelativeTime(message.created_at)}</span>
           {!isUser && !message.pending && (
             <button
-              className="hidden rounded px-1 text-[var(--text-secondary)] hover:text-[var(--text-primary)] group-hover:inline"
+              className="opacity-0 group-hover:opacity-100"
               onClick={copyText}
               type="button"
+              style={{
+                // Resting opacity comes from the "opacity-0" class: an inline opacity would outrank group-hover and keep the button hidden.
+                background: "none",
+                border: "none",
+                cursor: "pointer",
+                color: copied ? "var(--status-online)" : "var(--text-muted)", // colour changes while "copied" is set
+                fontSize: 11,
+                padding: "0 2px",
+                transition: "opacity 0.12s, color 0.12s",
+              }}
             >
-              {copied ? "Copied" : "Copy"}
+              {copied ? "✓ Copied" : "Copy"}
             </button>
           )}
         </div>
+
+        {/* Bubble */}
         <div
-          className={`rounded-lg border px-4 py-3 text-sm leading-7 ${
-            isUser
-              ? "border-[var(--border-muted)] bg-surface text-[var(--text-primary)]"
+          style={{
+            borderRadius: 10,
+            padding: "10px 14px",
+            fontSize: 13,
+            lineHeight: 1.7,
+            ...(isUser // one of three palettes: user, error, or regular reply
+              ? {
+                  background: "var(--bg-elevated)",
+                  border: "1px solid var(--border-muted)",
+                  color: "var(--text-primary)",
+                }
               : message.error
-                ? "border-red-500/20 bg-red-500/10 font-sans text-red-200"
-                : "border-[var(--border-subtle)] bg-[#0d0d0d] font-mono text-[var(--text-primary)]"
-          }`}
+              ? {
+                  background: "rgba(224,82,82,0.08)",
+                  border: "1px solid rgba(224,82,82,0.2)",
+                  color: "#f87171",
+                }
+              : {
+                  background: "var(--bg-surface)",
+                  border: "1px solid var(--border-subtle)",
+                  color: "var(--text-primary)",
+                  fontFamily: "var(--font-mono)",
+                }),
+          }}
         >
-          {message.pending ? <ThinkingIndicator /> : <span className="whitespace-pre-wrap">{message.text}</span>}
+          {message.pending ? (
+            <ThinkingIndicator />
+          ) : (
+            <span style={{ whiteSpace: "pre-wrap" }}>{message.text}</span>
+          )}
         </div>
       </div>
+
       {isUser && <MessageAvatar role={message.role} />}
     </motion.div>
   );
diff --git a/frontend/src/components/chat/ThinkingIndicator.tsx b/frontend/src/components/chat/ThinkingIndicator.tsx
index e83d0f5..7ec4a6c 100644
--- a/frontend/src/components/chat/ThinkingIndicator.tsx
+++ b/frontend/src/components/chat/ThinkingIndicator.tsx
@@ -1,12 +1,28 @@
 export function ThinkingIndicator() {
   return (
-    <div className="flex items-center gap-2 text-[var(--text-secondary)]">
-      <span>Quadtrix is thinking</span>
-      <span className="flex gap-1">
-        <span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[var(--text-secondary)]" />
-        <span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[var(--text-secondary)] [animation-delay:120ms]" />
-        <span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[var(--text-secondary)] [animation-delay:240ms]" />
+    <div style={{ display: "flex", alignItems: "center", gap: 8, color: "var(--text-muted)" }}>
+      <span style={{ fontSize: 12 }}>Generating</span>
+      <span style={{ display: "flex", gap: 3 }}>
+        {[0, 120, 240].map((delay) => (
+          <span
+            key={delay}
+            style={{
+              display: "inline-block", // transforms do not apply to plain inline boxes
+              width: 5,
+              height: 5,
+              borderRadius: "50%",
+              background: "var(--accent)",
+              animation: `quadtrix-thinking-bounce 1s ease-in-out ${delay}ms infinite`, // component-specific keyframes name avoids clashing with a global "bounce" (e.g. Tailwind's animate-bounce); delay staggers the dots
+            }}
+          />
+        ))}
       </span>
+      <style>{`
+        @keyframes quadtrix-thinking-bounce {
+          0%, 80%, 100% { transform: translateY(0); opacity: 0.4; }
+          40% { transform: translateY(-4px); opacity: 1; }
+        }
+      `}</style>
     </div>
   );
 }
diff --git a/include/tensor.h b/include/tensor.h
index f6ac4a5..c3526b6 100644
--- a/include/tensor.h
+++ b/include/tensor.h
@@ -1,8 +1,4 @@
 #pragma once
-// ============================================================
-//  include/tensor.h  –  Lightweight 2-D / 3-D float tensor
-//  (CPU only – mirrors what PyTorch tensors do in the model)
-// ============================================================
 
 #include <vector>
 #include <cmath>
@@ -15,310 +11,557 @@
 #include <iostream>
 #include <functional>
 
-// ------------------------------------------------------------------
-// Tensor  (row-major, float32)
-//   shape is stored as {d0, d1}  or  {d0, d1, d2}
-// ------------------------------------------------------------------
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
 struct Tensor
 {
-      std::vector<int> shape;
-      std::vector<float> data;
-
-      Tensor() = default;
-
-      Tensor(std::vector<int> sh, float fill = 0.0f)
-          : shape(sh)
-      {
-            int total = 1;
-            for (int d : sh)
-                  total *= d;
-            data.assign(total, fill);
-      }
-
-      int numel() const
-      {
-            int n = 1;
-            for (int d : shape)
-                  n *= d;
-            return n;
-      }
-
-      int ndim() const { return (int)shape.size(); }
-
-      // ---- element access helpers --------------------------------
-      float &at(int i)
-      {
-            assert(i >= 0 && i < (int)data.size());
-            return data[i];
-      }
-      float at(int i) const
-      {
-            assert(i >= 0 && i < (int)data.size());
-            return data[i];
-      }
-
-      // 2-D
-      float &at(int r, int c)
-      {
-            return data[r * shape[1] + c];
-      }
-      float at(int r, int c) const
-      {
-            return data[r * shape[1] + c];
-      }
-
-      // 3-D
-      float &at(int b, int r, int c)
-      {
-            return data[b * shape[1] * shape[2] + r * shape[2] + c];
-      }
-      float at(int b, int r, int c) const
-      {
-            return data[b * shape[1] * shape[2] + r * shape[2] + c];
-      }
-
-      // ---- factory helpers ---------------------------------------
-      static Tensor zeros(std::vector<int> sh) { return Tensor(sh, 0.0f); }
-      static Tensor ones(std::vector<int> sh) { return Tensor(sh, 1.0f); }
-
-      static Tensor randn(std::vector<int> sh, float mean, float std,
-                          std::mt19937 &rng)
-      {
-            std::normal_distribution<float> dist(mean, std);
-            Tensor t(sh);
-            for (auto &v : t.data)
-                  v = dist(rng);
-            return t;
-      }
-
-      void fill(float v) { std::fill(data.begin(), data.end(), v); }
-
-      // ---- print shape -------------------------------------------
-      void print_shape(const std::string &name = "") const
-      {
-            if (!name.empty())
-                  std::cout << name << ": ";
-            std::cout << "[";
-            for (int i = 0; i < (int)shape.size(); ++i)
-            {
-                  std::cout << shape[i];
-                  if (i + 1 < (int)shape.size())
-                        std::cout << ", ";
-            }
-            std::cout << "]" << std::endl;
-      }
-};
+    std::vector<int> shape; // size of each dimension
+    std::vector<float> data; // flat element storage addressed by the at() overloads
+
+    Tensor() = default;
+
+    Tensor(std::vector<int> sh, float fill = 0.0f) : shape(std::move(sh)) // holds product(shape) elements, each set to fill
+    {
+        int total = 1;
+        for (int d : shape)
+            total *= d;
+        data.reserve(total);
+        data.assign(total, fill);
+    }
+
+    Tensor(const Tensor &) = default;
+    Tensor(Tensor &&) noexcept = default;
+    Tensor &operator=(const Tensor &) = default;
+    Tensor &operator=(Tensor &&) noexcept = default;
+
+    int numel() const // product of all shape entries (1 when shape is empty)
+    {
+        int n = 1;
+        for (int d : shape)
+            n *= d;
+        return n;
+    }
 
-// ------------------------------------------------------------------
-// Basic math ops  (in-place and returning new tensors)
-// ------------------------------------------------------------------
+    int ndim() const { return (int)shape.size(); }
+
+    float &at(int i) { assert(i >= 0 && i < (int)data.size()); return data[i]; } // flat index; range-checked only while assert is active (no cost under NDEBUG)
+    float at(int i) const { assert(i >= 0 && i < (int)data.size()); return data[i]; }
+
+    float &at(int r, int c) { return data[r * shape[1] + c]; } // 2-D row-major index; not range-checked
+    float at(int r, int c) const { return data[r * shape[1] + c]; }
+
+    float &at(int b, int r, int c) { return data[b * shape[1] * shape[2] + r * shape[2] + c]; } // 3-D row-major index; not range-checked
+    float at(int b, int r, int c) const { return data[b * shape[1] * shape[2] + r * shape[2] + c]; }
+
+    static Tensor zeros(std::vector<int> sh) { return Tensor(std::move(sh), 0.0f); } // sh is already a by-value copy, so move it on
+    static Tensor ones(std::vector<int> sh) { return Tensor(std::move(sh), 1.0f); }
+
+    static Tensor randn(std::vector<int> sh, float mean, float std, std::mt19937 &rng) // every element drawn from normal(mean, std) using rng
+    {
+        std::normal_distribution<float> dist(mean, std);
+        Tensor t(std::move(sh));
+        for (auto &v : t.data)
+            v = dist(rng);
+        return t;
+    }
+
+    void fill(float v) { std::fill(data.begin(), data.end(), v); }
+
+    void print_shape(const std::string &name = "") const // writes "name: [d0, d1, ...]" to std::cout; the prefix is omitted when name is empty
+    {
+        if (!name.empty())
+            std::cout << name << ": ";
+        std::cout << "[";
+        for (int i = 0; i < (int)shape.size(); ++i)
+        {
+            std::cout << shape[i];
+            if (i + 1 < (int)shape.size())
+                std::cout << ", ";
+        }
+        std::cout << "]" << std::endl;
+    }
+};
 
-// element-wise add (same shape)
 inline Tensor add(const Tensor &a, const Tensor &b)
 {
-      assert(a.data.size() == b.data.size());
-      Tensor c(a.shape);
-      for (int i = 0; i < (int)a.data.size(); ++i)
-            c.data[i] = a.data[i] + b.data[i];
-      return c;
+    assert(a.data.size() == b.data.size()); // only element counts are compared, not shapes
+    Tensor c(a.shape); // result takes a's shape
+    size_t n = a.data.size();
+
+#ifdef __AVX__
+    size_t i = 0;
+    size_t vec_end = n & ~7ULL; // n rounded down to a multiple of 8 (floats per 256-bit register)
+    for (; i < vec_end; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps(&a.data[i]); // unaligned loads and stores throughout
+        __m256 vb = _mm256_loadu_ps(&b.data[i]);
+        __m256 vc = _mm256_add_ps(va, vb);
+        _mm256_storeu_ps(&c.data[i], vc);
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        c.data[i] = a.data[i] + b.data[i];
+#elif defined(__SSE__)
+    size_t i = 0;
+    size_t vec_end = n & ~3ULL; // n rounded down to a multiple of 4 (floats per 128-bit register)
+    for (; i < vec_end; i += 4)
+    {
+        __m128 va = _mm_loadu_ps(&a.data[i]);
+        __m128 vb = _mm_loadu_ps(&b.data[i]);
+        __m128 vc = _mm_add_ps(va, vb);
+        _mm_storeu_ps(&c.data[i], vc);
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        c.data[i] = a.data[i] + b.data[i];
+#else
+    for (size_t i = 0; i < n; ++i) // scalar path when neither AVX nor SSE is available
+        c.data[i] = a.data[i] + b.data[i];
+#endif
+    return c;
+}
+
+inline void add_inplace(Tensor &a, const Tensor &b) // a[i] += b[i] for every element; a is modified, nothing is returned
+{
+    assert(a.data.size() == b.data.size()); // only element counts are compared, not shapes
+    size_t n = a.data.size();
+
+#ifdef __AVX__
+    size_t i = 0;
+    size_t vec_end = n & ~7ULL; // n rounded down to a multiple of 8
+    for (; i < vec_end; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps(&a.data[i]);
+        __m256 vb = _mm256_loadu_ps(&b.data[i]);
+        __m256 vc = _mm256_add_ps(va, vb);
+        _mm256_storeu_ps(&a.data[i], vc); // write the sum back over a
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        a.data[i] += b.data[i];
+#elif defined(__SSE__)
+    size_t i = 0;
+    size_t vec_end = n & ~3ULL; // n rounded down to a multiple of 4
+    for (; i < vec_end; i += 4)
+    {
+        __m128 va = _mm_loadu_ps(&a.data[i]);
+        __m128 vb = _mm_loadu_ps(&b.data[i]);
+        __m128 vc = _mm_add_ps(va, vb);
+        _mm_storeu_ps(&a.data[i], vc); // write the sum back over a
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        a.data[i] += b.data[i];
+#else
+    for (size_t i = 0; i < n; ++i) // scalar path when neither AVX nor SSE is available
+        a.data[i] += b.data[i];
+#endif
 }
 
-// scalar multiply
 inline Tensor scale(const Tensor &a, float s)
 {
-      Tensor c(a.shape);
-      for (int i = 0; i < (int)a.data.size(); ++i)
-            c.data[i] = a.data[i] * s;
-      return c;
+    Tensor c(a.shape); // result has a's shape; a itself is left untouched
+    size_t n = a.data.size();
+
+#ifdef __AVX__
+    size_t i = 0;
+    size_t vec_end = n & ~7ULL; // n rounded down to a multiple of 8
+    __m256 vs = _mm256_set1_ps(s); // s copied into all 8 lanes
+    for (; i < vec_end; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps(&a.data[i]);
+        __m256 vc = _mm256_mul_ps(va, vs);
+        _mm256_storeu_ps(&c.data[i], vc);
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        c.data[i] = a.data[i] * s;
+#elif defined(__SSE__)
+    size_t i = 0;
+    size_t vec_end = n & ~3ULL; // n rounded down to a multiple of 4
+    __m128 vs = _mm_set1_ps(s); // s copied into all 4 lanes
+    for (; i < vec_end; i += 4)
+    {
+        __m128 va = _mm_loadu_ps(&a.data[i]);
+        __m128 vc = _mm_mul_ps(va, vs);
+        _mm_storeu_ps(&c.data[i], vc);
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        c.data[i] = a.data[i] * s;
+#else
+    for (size_t i = 0; i < n; ++i) // scalar path when neither AVX nor SSE is available
+        c.data[i] = a.data[i] * s;
+#endif
+    return c;
+}
+
+inline void scale_inplace(Tensor &a, float s) // multiplies every element of a by s, modifying a
+{
+    size_t n = a.data.size();
+
+#ifdef __AVX__
+    size_t i = 0;
+    size_t vec_end = n & ~7ULL; // n rounded down to a multiple of 8
+    __m256 vs = _mm256_set1_ps(s); // s copied into all 8 lanes
+    for (; i < vec_end; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps(&a.data[i]);
+        __m256 vc = _mm256_mul_ps(va, vs);
+        _mm256_storeu_ps(&a.data[i], vc); // write the product back over a
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        a.data[i] *= s;
+#elif defined(__SSE__)
+    size_t i = 0;
+    size_t vec_end = n & ~3ULL; // n rounded down to a multiple of 4
+    __m128 vs = _mm_set1_ps(s); // s copied into all 4 lanes
+    for (; i < vec_end; i += 4)
+    {
+        __m128 va = _mm_loadu_ps(&a.data[i]);
+        __m128 vc = _mm_mul_ps(va, vs);
+        _mm_storeu_ps(&a.data[i], vc); // write the product back over a
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        a.data[i] *= s;
+#else
+    for (auto &v : a.data) // scalar path when neither AVX nor SSE is available
+        v *= s;
+#endif
 }
 
-// ReLU
 inline Tensor relu(const Tensor &a)
 {
-      Tensor c(a.shape);
-      for (int i = 0; i < (int)a.data.size(); ++i)
-            c.data[i] = std::max(0.0f, a.data[i]);
-      return c;
+    Tensor c(a.shape); // result has a's shape; a itself is left untouched
+    size_t n = a.data.size();
+
+#ifdef __AVX__
+    size_t i = 0;
+    size_t vec_end = n & ~7ULL; // n rounded down to a multiple of 8
+    __m256 zero = _mm256_setzero_ps();
+    for (; i < vec_end; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps(&a.data[i]);
+        __m256 vc = _mm256_max_ps(va, zero); // lane-wise max(a, 0)
+        _mm256_storeu_ps(&c.data[i], vc);
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        c.data[i] = std::max(0.0f, a.data[i]);
+#elif defined(__SSE__)
+    size_t i = 0;
+    size_t vec_end = n & ~3ULL; // n rounded down to a multiple of 4
+    __m128 zero = _mm_setzero_ps();
+    for (; i < vec_end; i += 4)
+    {
+        __m128 va = _mm_loadu_ps(&a.data[i]);
+        __m128 vc = _mm_max_ps(va, zero); // lane-wise max(a, 0)
+        _mm_storeu_ps(&c.data[i], vc);
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        c.data[i] = std::max(0.0f, a.data[i]);
+#else
+    for (size_t i = 0; i < n; ++i) // scalar path when neither AVX nor SSE is available
+        c.data[i] = std::max(0.0f, a.data[i]);
+#endif
+    return c;
 }
 
-// Softmax along last dim for 3-D tensor [B, T, C]
-inline Tensor softmax3d(const Tensor &a)
+inline void relu_inplace(Tensor &a) // replaces every element of a with max(0, element)
 {
-      int B = a.shape[0], T = a.shape[1], C = a.shape[2];
-      Tensor out(a.shape);
-      for (int b = 0; b < B; ++b)
-      {
-            for (int t = 0; t < T; ++t)
-            {
-                  float maxv = -1e30f;
-                  for (int c = 0; c < C; ++c)
-                        maxv = std::max(maxv, a.at(b, t, c));
-                  float sumv = 0.0f;
-                  for (int c = 0; c < C; ++c)
-                  {
-                        float e = std::exp(a.at(b, t, c) - maxv);
-                        out.at(b, t, c) = e;
-                        sumv += e;
-                  }
-                  for (int c = 0; c < C; ++c)
-                        out.at(b, t, c) /= sumv;
-            }
-      }
-      return out;
+    size_t n = a.data.size();
+
+#ifdef __AVX__
+    size_t i = 0;
+    size_t vec_end = n & ~7ULL; // n rounded down to a multiple of 8
+    __m256 zero = _mm256_setzero_ps();
+    for (; i < vec_end; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps(&a.data[i]);
+        __m256 vc = _mm256_max_ps(va, zero); // lane-wise max(a, 0)
+        _mm256_storeu_ps(&a.data[i], vc); // write the result back over a
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        a.data[i] = std::max(0.0f, a.data[i]);
+#elif defined(__SSE__)
+    size_t i = 0;
+    size_t vec_end = n & ~3ULL; // n rounded down to a multiple of 4
+    __m128 zero = _mm_setzero_ps();
+    for (; i < vec_end; i += 4)
+    {
+        __m128 va = _mm_loadu_ps(&a.data[i]);
+        __m128 vc = _mm_max_ps(va, zero); // lane-wise max(a, 0)
+        _mm_storeu_ps(&a.data[i], vc); // write the result back over a
+    }
+    for (; i < n; ++i) // scalar tail for the leftover elements
+        a.data[i] = std::max(0.0f, a.data[i]);
+#else
+    for (auto &v : a.data) // scalar path when neither AVX nor SSE is available
+        v = std::max(0.0f, v);
+#endif
 }
 
-// Softmax along last dim for 2-D tensor [T, C]
-inline Tensor softmax2d(const Tensor &a)
+inline Tensor softmax3d(const Tensor &a) // softmax over the last dimension of a 3-D tensor
 {
-      int T = a.shape[0], C = a.shape[1];
-      Tensor out(a.shape);
-      for (int t = 0; t < T; ++t)
-      {
+    int B = a.shape[0], T = a.shape[1], C = a.shape[2]; // reads shape[0..2], so a must have at least three dimensions
+    Tensor out(a.shape);
+
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (B * T > 64)
+#endif
+    for (int b = 0; b < B; ++b) // each (b, t) row is handled independently and writes only its own slice of out
+    {
+        for (int t = 0; t < T; ++t)
+        {
             float maxv = -1e30f;
             for (int c = 0; c < C; ++c)
-                  maxv = std::max(maxv, a.at(t, c));
+                maxv = std::max(maxv, a.at(b, t, c));
+
             float sumv = 0.0f;
             for (int c = 0; c < C; ++c)
             {
-                  float e = std::exp(a.at(t, c) - maxv);
-                  out.at(t, c) = e;
-                  sumv += e;
+                float e = std::exp(a.at(b, t, c) - maxv); // subtracting the row max keeps the exponent at or below 0
+                out.at(b, t, c) = e;
+                sumv += e;
             }
+
+            float inv_sum = 1.0f / sumv; // divide once per row; the loop below multiplies instead
             for (int c = 0; c < C; ++c)
-                  out.at(t, c) /= sumv;
-      }
-      return out;
+                out.at(b, t, c) *= inv_sum;
+        }
+    }
+    return out;
 }
 
-// Layer-norm along last dim  [B, T, C]  → same shape
-inline Tensor layer_norm(const Tensor &x,
-                         const Tensor &gamma, // [C]
-                         const Tensor &beta,  // [C]
-                         float eps = 1e-5f)
+inline Tensor softmax2d(const Tensor &a) // softmax over the last dimension of a 2-D tensor
 {
-      int B = x.shape[0], T = x.shape[1], C = x.shape[2];
-      Tensor out(x.shape);
-      for (int b = 0; b < B; ++b)
-      {
-            for (int t = 0; t < T; ++t)
+    int T = a.shape[0], C = a.shape[1]; // reads shape[0..1], so a must have at least two dimensions
+    Tensor out(a.shape);
+
+#ifdef _OPENMP
+#pragma omp parallel for if (T > 128)
+#endif
+    for (int t = 0; t < T; ++t) // each row is handled independently and writes only its own slice of out
+    {
+        float maxv = -1e30f; // starting value for the running row maximum
+        for (int c = 0; c < C; ++c)
+            maxv = std::max(maxv, a.at(t, c));
+
+        float sumv = 0.0f;
+        for (int c = 0; c < C; ++c)
+        {
+            float e = std::exp(a.at(t, c) - maxv); // subtracting the row max keeps the exponent at or below 0
+            out.at(t, c) = e;
+            sumv += e;
+        }
+
+        float inv_sum = 1.0f / sumv; // divide once per row; the loop below multiplies instead
+        for (int c = 0; c < C; ++c)
+            out.at(t, c) *= inv_sum;
+    }
+    return out;
+}
+
+inline Tensor layer_norm(const Tensor &x, const Tensor &gamma, const Tensor &beta, float eps = 1e-5f) // normalizes each (b, t) row of x over its last dimension, then applies gamma and beta per channel
+{
+    int B = x.shape[0], T = x.shape[1], C = x.shape[2]; // reads shape[0..2], so x must have at least three dimensions
+    Tensor out(x.shape);
+
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (B * T > 64)
+#endif
+    for (int b = 0; b < B; ++b) // each (b, t) row is handled independently and writes only its own slice of out
+    {
+        for (int t = 0; t < T; ++t)
+        {
+            float mu = 0.0f;
+            for (int c = 0; c < C; ++c)
+                mu += x.at(b, t, c);
+            mu /= C; // mean of the row
+
+            float var = 0.0f;
+            for (int c = 0; c < C; ++c)
             {
-                  float mu = 0.0f;
-                  for (int c = 0; c < C; ++c)
-                        mu += x.at(b, t, c);
-                  mu /= C;
-                  float var = 0.0f;
-                  for (int c = 0; c < C; ++c)
-                  {
-                        float d = x.at(b, t, c) - mu;
-                        var += d * d;
-                  }
-                  var /= C;
-                  float inv = 1.0f / std::sqrt(var + eps);
-                  for (int c = 0; c < C; ++c)
-                        out.at(b, t, c) = (x.at(b, t, c) - mu) * inv * gamma.at(c) + beta.at(c);
+                float d = x.at(b, t, c) - mu;
+                var += d * d;
             }
-      }
-      return out;
+            var /= C; // variance with a divisor of C (not C - 1)
+
+            float inv = 1.0f / std::sqrt(var + eps); // eps keeps the denominator away from zero
+            for (int c = 0; c < C; ++c)
+                out.at(b, t, c) = (x.at(b, t, c) - mu) * inv * gamma.at(c) + beta.at(c); // gamma and beta are read by flat index c
+        }
+    }
+    return out;
 }
 
-// matmul:  [B, T, D] x [D, E]  →  [B, T, E]
 inline Tensor matmul(const Tensor &a, const Tensor &w)
 {
-      // a: [B, T, D]  or  [B, T, D]
-      // w: [D, E]
-      assert(a.ndim() == 3 && w.ndim() == 2);
-      int B = a.shape[0], T = a.shape[1], D = a.shape[2];
-      int E = w.shape[1];
-      assert(w.shape[0] == D);
-      Tensor out({B, T, E}, 0.0f);
-      for (int b = 0; b < B; ++b)
-            for (int t = 0; t < T; ++t)
-                  for (int e = 0; e < E; ++e)
-                  {
-                        float s = 0.0f;
-                        for (int d = 0; d < D; ++d)
-                              s += a.at(b, t, d) * w.at(d, e);
-                        out.at(b, t, e) = s;
-                  }
-      return out;
+    assert(a.ndim() == 3 && w.ndim() == 2); // a is [B, T, D], w is [D, E]; the result is [B, T, E]
+    int B = a.shape[0], T = a.shape[1], D = a.shape[2];
+    int E = w.shape[1];
+    assert(w.shape[0] == D); // inner dimensions must agree
+
+    Tensor out({B, T, E}, 0.0f); // zero-initialised: the tiled loops below accumulate into it
+
+    const int TILE_T = 32; // tile edge lengths for the blocked loops below
+    const int TILE_E = 32;
+    const int TILE_D = 32;
+
+#ifdef _OPENMP // the work estimate in the if-clause is computed in long long so large shapes cannot overflow int
+#pragma omp parallel for collapse(2) if ((long long)B * T * E * D > 100000)
+#endif
+    for (int b = 0; b < B; ++b) // the collapsed (b, t0) iterations write disjoint rows of out
+    {
+        for (int t0 = 0; t0 < T; t0 += TILE_T)
+        {
+            int t_end = std::min(t0 + TILE_T, T); // clamp the last, possibly partial, tile
+            for (int e0 = 0; e0 < E; e0 += TILE_E)
+            {
+                int e_end = std::min(e0 + TILE_E, E);
+                for (int d0 = 0; d0 < D; d0 += TILE_D)
+                {
+                    int d_end = std::min(d0 + TILE_D, D);
+                    for (int t = t0; t < t_end; ++t)
+                    {
+                        for (int e = e0; e < e_end; ++e)
+                        {
+                            float s = out.at(b, t, e); // resume the partial sum left by earlier d-tiles
+                            for (int d = d0; d < d_end; ++d)
+                                s += a.at(b, t, d) * w.at(d, e);
+                            out.at(b, t, e) = s;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return out;
 }
 
-// add bias [E] broadcast over [B, T, E]
 inline Tensor add_bias(const Tensor &x, const Tensor &bias)
 {
-      assert(x.shape.back() == bias.shape[0]);
-      Tensor out = x;
-      int E = bias.shape[0];
-      int stride = E;
-      int n = x.numel() / E;
-      for (int i = 0; i < n; ++i)
-            for (int e = 0; e < E; ++e)
-                  out.data[i * stride + e] += bias.data[e];
-      return out;
+    assert(x.shape.back() == bias.shape[0]); // the last dimension of x must match the bias length
+    Tensor out = x; // start from a copy of x; x itself is left untouched
+    int E = bias.shape[0];
+    int stride = E; // consecutive rows are E elements apart in the flat buffer
+    int n = x.numel() / E; // number of length-E rows
+
+#ifdef _OPENMP
+#pragma omp parallel for if (n * E > 10000)
+#endif
+    for (int i = 0; i < n; ++i) // each row writes only its own E elements of out
+    {
+        for (int e = 0; e < E; ++e)
+            out.data[i * stride + e] += bias.data[e];
+    }
+    return out;
 }
 
-// batched matmul:  [B, T, D] x [B, D, T2]  →  [B, T, T2]
 inline Tensor bmm(const Tensor &a, const Tensor &b)
 {
-      assert(a.ndim() == 3 && b.ndim() == 3);
-      int B = a.shape[0], T = a.shape[1], D = a.shape[2];
-      int T2 = b.shape[2];
-      assert(b.shape[0] == B && b.shape[1] == D);
-      Tensor out({B, T, T2}, 0.0f);
-      for (int bb = 0; bb < B; ++bb)
-            for (int t = 0; t < T; ++t)
-                  for (int t2 = 0; t2 < T2; ++t2)
-                  {
-                        float s = 0.0f;
-                        for (int d = 0; d < D; ++d)
-                              s += a.at(bb, t, d) * b.at(bb, d, t2);
-                        out.at(bb, t, t2) = s;
-                  }
-      return out;
+    assert(a.ndim() == 3 && b.ndim() == 3); // a is [B, T, D], b is [B, D, T2]; the result is [B, T, T2]
+    int B = a.shape[0], T = a.shape[1], D = a.shape[2];
+    int T2 = b.shape[2];
+    assert(b.shape[0] == B && b.shape[1] == D); // batch and inner dimensions must agree
+
+    Tensor out({B, T, T2}, 0.0f); // zero-initialised: the tiled loops below accumulate into it
+
+    const int TILE = 32; // tile edge length shared by the t, t2 and d loops
+
+#ifdef _OPENMP // the work estimate in the if-clause is computed in long long so large shapes cannot overflow int
+#pragma omp parallel for if ((long long)B * T * T2 * D > 100000)
+#endif
+    for (int bb = 0; bb < B; ++bb) // parallel over batches only; each batch writes its own slice of out
+    {
+        for (int t0 = 0; t0 < T; t0 += TILE)
+        {
+            int t_end = std::min(t0 + TILE, T); // clamp the last, possibly partial, tile
+            for (int t2_0 = 0; t2_0 < T2; t2_0 += TILE)
+            {
+                int t2_end = std::min(t2_0 + TILE, T2);
+                for (int d0 = 0; d0 < D; d0 += TILE)
+                {
+                    int d_end = std::min(d0 + TILE, D);
+                    for (int t = t0; t < t_end; ++t)
+                    {
+                        for (int t2 = t2_0; t2 < t2_end; ++t2)
+                        {
+                            float s = out.at(bb, t, t2); // resume the partial sum left by earlier d-tiles
+                            for (int d = d0; d < d_end; ++d)
+                                s += a.at(bb, t, d) * b.at(bb, d, t2);
+                            out.at(bb, t, t2) = s;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return out;
 }
 
-// transpose last two dims of 3-D tensor [B, T, D] → [B, D, T]
 inline Tensor transpose23(const Tensor &a)
 {
-      int B = a.shape[0], T = a.shape[1], D = a.shape[2];
-      Tensor out({B, D, T});
-      for (int b = 0; b < B; ++b)
+    int B = a.shape[0], T = a.shape[1], D = a.shape[2]; // a is read as a 3-D tensor [B, T, D]
+    Tensor out({B, D, T});                              // result has the last two dims swapped: [B, D, T]
+
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (B * T * D > 10000)
+#endif
+    for (int b = 0; b < B; ++b) // with OpenMP, the (b, d) iteration pairs are divided among threads
+    {
+        for (int d = 0; d < D; ++d)
+        {
             for (int t = 0; t < T; ++t)
-                  for (int d = 0; d < D; ++d)
-                        out.at(b, d, t) = a.at(b, t, d);
-      return out;
+                out.at(b, d, t) = a.at(b, t, d); // every (b, d, t) is written exactly once
+        }
+    }
+    return out;
 }
 
-// concat along last dim:  [B,T,D1] + [B,T,D2] → [B,T,D1+D2]
 inline Tensor cat_last(const std::vector<Tensor> &ts)
 {
-      int B = ts[0].shape[0], T = ts[0].shape[1];
-      int total = 0;
-      for (auto &t : ts)
-            total += t.shape[2];
-      Tensor out({B, T, total}, 0.0f);
-      int offset = 0;
-      for (auto &t : ts)
-      {
-            int D = t.shape[2];
-            for (int b = 0; b < B; ++b)
-                  for (int tt = 0; tt < T; ++tt)
-                        for (int d = 0; d < D; ++d)
-                              out.at(b, tt, offset + d) = t.at(b, tt, d);
-            offset += D;
-      }
-      return out;
+    int B = ts[0].shape[0], T = ts[0].shape[1]; // concat along the last dim; NOTE(review): ts[0] is read unchecked, so ts must be non-empty
+    int total = 0;
+    for (auto &t : ts)
+        total += t.shape[2]; // width of the result = sum of the inputs' last dims
+
+    Tensor out({B, T, total}, 0.0f);
+
+    int offset = 0; // first output column of the tensor currently being copied
+    for (auto &t : ts)
+    {
+        int D = t.shape[2]; // B and T are taken from ts[0] only; assumes every tensor shares them — TODO confirm
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (B * T * D > 10000)
+#endif
+        for (int b = 0; b < B; ++b) // one parallel loop per input tensor, applied only above 10000 elements
+        {
+            for (int tt = 0; tt < T; ++tt)
+            {
+                for (int d = 0; d < D; ++d)
+                    out.at(b, tt, offset + d) = t.at(b, tt, d);
+            }
+        }
+        offset += D; // the next tensor lands immediately to the right of this one
+    }
+    return out;
 }
 
-// dropout mask (applied only during training)
 inline Tensor dropout(const Tensor &x, float p, bool training, std::mt19937 &rng)
 {
-      if (!training || p == 0.0f)
-            return x;
-      std::bernoulli_distribution dist(1.0f - p);
-      Tensor out = x;
-      float scale_v = 1.0f / (1.0f - p);
-      for (auto &v : out.data)
-            v = dist(rng) ? v * scale_v : 0.0f;
-      return out;
+    if (!training || p == 0.0f) // dropout is a no-op outside training or when nothing is dropped
+        return x;
+
+    std::bernoulli_distribution dist(1.0f - p); // true with probability 1 - p, i.e. "keep this element"
+    Tensor out = x;
+    float scale_v = 1.0f / (1.0f - p);          // kept values are scaled up by 1 / (1 - p)
+
+    for (auto &v : out.data) // serial loop: every element draws from the caller's rng in order
+        v = dist(rng) ? v * scale_v : 0.0f;
+
+    return out;
 }
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 006af20..7fc540c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -103,6 +103,22 @@ static std::string choose_output_path(const std::string &requested_path,
       return exe_relative;
 }
 
+// Generate n_tokens tokens one at a time from a context seeded with token id 0, streaming each to stdout.
+static void sample_tokens(GPTLanguageModel &model,
+                          DataLoader &dl,
+                          int n_tokens)
+{
+      std::vector<int> ctx = {0};
+      for (int i = 0; i < n_tokens; ++i) // NOTE(review): unlike the --generate loop in main(), this never checks g_interrupted, so Ctrl+C cannot cut a long sample short — confirm intended
+      {
+            ctx = model.generate(ctx, 1); // presumably returns ctx extended by one new token — confirm against GPTLanguageModel::generate
+            std::cout << dl.decode({ctx.back()}) << std::flush; // decode and print only the newest token
+            if ((int)ctx.size() > BLOCK_SIZE) // keep just the last BLOCK_SIZE tokens as context
+                  ctx = std::vector<int>(ctx.end() - BLOCK_SIZE, ctx.end());
+      }
+      std::cout << "\n";
+}
+
 // estimate loss — no gradients, training=false
 static float estimate_loss(GPTLanguageModel &model,
                            DataLoader &dl,
@@ -184,10 +200,7 @@ int main(int argc, char *argv[])
       std::signal(SIGINT, sig_handler);
 
       // Banner
-      std::cout << std::string(60, '=') << "\n";
       std::cout << " Quadtrix v1.0 (C++)\n";
-      std::cout << std::string(60, '=') << "\n";
-      std::cout << "\n[INFO] Starting at: " << now_str() << "\n";
 
       std::string data_path = DEFAULT_CLEANED_PATH;
       const char *env_data_path = std::getenv(DATA_PATH_ENV_VAR.c_str());
@@ -219,17 +232,6 @@ int main(int argc, char *argv[])
       data_path = choose_existing_path(data_path, argv[0]);
       model_path = choose_output_path(model_path, argv[0]);
 
-      // Config print
-      std::cout << "\n[CONFIG] Hyperparameters:\n";
-      std::cout << "         batch_size=" << BATCH_SIZE
-                << "  block_size=" << BLOCK_SIZE << "\n";
-      std::cout << "         max_iters=" << MAX_ITERS
-                << "  learning_rate=" << LEARNING_RATE << "\n";
-      std::cout << "         n_embd=" << N_EMBD
-                << "  n_head=" << N_HEAD
-                << "  n_layer=" << N_LAYER
-                << "  dropout=" << DROPOUT << "\n";
-
       //  Data
       DataLoader dl;
       try
@@ -247,13 +249,12 @@ int main(int argc, char *argv[])
       GPTLanguageModel model(dl.vocab_size, N_EMBD, N_HEAD, N_LAYER, BLOCK_SIZE, SEED);
 
       long n_params = model.num_params();
-      std::cout << "[MODEL] Parameters  : "
-                << std::fixed << std::setprecision(2)
-                << n_params / 1.0e6f << " M  (" << n_params << " total)\n";
-      std::cout << "[MODEL] Architecture: "
-                << N_LAYER << " layers x "
-                << N_HEAD << " heads x "
-                << N_EMBD << " embedding dim\n";
+      std::cout << "max_seq_len: " << BLOCK_SIZE << "\n";
+      std::cout << "vocab_size: " << dl.vocab_size << "\n";
+      std::cout << "num_layers: " << N_LAYER << "\n";
+      std::cout << "num_heads: " << N_HEAD << "\n";
+      std::cout << "channels: " << N_EMBD << "\n";
+      std::cout << "num_parameters: " << n_params << "\n";
 
       // chat mode
       if (chat_mode)
@@ -268,9 +269,8 @@ int main(int argc, char *argv[])
             }
 
             model.load(model_path);
-            std::cout << "[CHAT]  Weights loaded from " << model_path << "\n";
-            std::cout << "[CHAT]  Max tokens per reply: " << chat_tokens
-                      << "  (override with --chat-tokens N)\n";
+            std::cout << "weights: " << model_path << "\n";
+            std::cout << "max_tokens: " << chat_tokens << "\n";
 
             run_chat(model, dl, chat_tokens);
             return 0;
@@ -289,10 +289,7 @@ int main(int argc, char *argv[])
             }
 
             model.load(model_path);
-            std::cout << "\n"
-                      << std::string(60, '-') << "\n";
-            std::cout << "  Quadtrix OUTPUT  (Ctrl+C to stop)\n";
-            std::cout << std::string(60, '-') << "\n\n";
+            std::cout << "\ngenerating:\n";
             std::vector<int> ctx = {0};
             while (!g_interrupted)
             {
@@ -301,7 +298,7 @@ int main(int argc, char *argv[])
                   if ((int)ctx.size() > BLOCK_SIZE)
                         ctx = std::vector<int>(ctx.end() - BLOCK_SIZE, ctx.end());
             }
-            std::cout << "\n\n[Stopped by user]\n";
+            std::cout << "\n";
             return 0;
       }
 
@@ -312,114 +309,78 @@ int main(int argc, char *argv[])
       std::mt19937 rng(SEED);
 
       // training loop
-      std::cout << "\n"
-                << std::string(60, '-') << "\n";
-      std::cout << "  TRAINING  ("
-                << MAX_ITERS << " iters, eval every "
-                << EVAL_INTERVAL << ")\n";
-      std::cout << std::string(60, '-') << "\n";
 
       float best_val_loss = 1e30f;
+      float last_val_loss = 0.0f;
       double train_start = wall_secs();
-      double last_eval_time = train_start; // ← tracks time of previous eval
 
-      for (int iter = 0; iter <= MAX_ITERS && !g_interrupted; ++iter)
+      // compute initial val loss before training
       {
+            std::mt19937 init_rng(SEED);
+            last_val_loss = estimate_loss(model, dl, "val", init_rng);
+      }
 
-            // Periodic eval checkpoint
-            if (iter % EVAL_INTERVAL == 0 || iter == MAX_ITERS)
-            {
-                  double now = wall_secs();
-                  double elapsed = now - train_start;
-
-                  // ms per training step since the last eval window
-                  double window_secs = now - last_eval_time;
-                  int steps_in_win = (iter == 0) ? 1 : EVAL_INTERVAL;
-                  double ms_per_step = window_secs * 1000.0 / steps_in_win;
-
-                  // tokens processed per second
-                  long toks_in_win = (long)BATCH_SIZE * BLOCK_SIZE * steps_in_win;
-                  int tok_per_sec = (window_secs > 0.0)
-                                        ? (int)(toks_in_win / window_secs)
-                                        : 0;
-
-                  last_eval_time = now; // reset window
-
-                  float tl = estimate_loss(model, dl, "train", rng);
-                  float vl = estimate_loss(model, dl, "val", rng);
-
-                  bool better = vl < best_val_loss;
-                  if (better)
-                  {
-                        best_val_loss = vl;
-                        model.save(model_path);
-                  }
-
-                  // ── new log line ─────────────────────────────────────────────
-                  std::cout
-                      << "step "
-                      << std::setw(5) << iter << "/" << MAX_ITERS
-                      << " | loss "
-                      << std::fixed << std::setprecision(6) << tl
-                      << " | val "
-                      << std::fixed << std::setprecision(6) << vl
-                      << " | lr "
-                      << std::scientific << std::setprecision(2) << (float)LEARNING_RATE
-                      << " | "
-                      << std::fixed << std::setprecision(2) << ms_per_step << " ms"
-                      << " | " << tok_per_sec << " tok/s"
-                      << (better ? "  *best*" : "")
-                      << "\n";
-                  std::cout.flush();
-
-                  if (iter == MAX_ITERS)
-                        break;
-            }
+      for (int iter = 1; iter <= MAX_ITERS && !g_interrupted; ++iter)
+      {
+            double step_start = wall_secs();
 
-            // Sample training batch
+            // train step
             std::pair<std::vector<int>, std::vector<int>> batch =
                 dl.get_batch("train", BATCH_SIZE, BLOCK_SIZE, rng);
 
-            // Forward — saves all intermediate activations
             SavedForward saved = forward_save(model,
                                               batch.first, BATCH_SIZE, BLOCK_SIZE,
                                               batch.second, /*training=*/true);
 
-            //  Backward — exact analytical gradients
-            Grads grads = backward(model, saved);
+            float batch_loss = model.forward(batch.first, BATCH_SIZE, BLOCK_SIZE,
+                                             batch.second, false)
+                                   .second;
 
-            // AdamW parameter update
+            Grads grads = backward(model, saved);
             apply_grads(model, grads, opt);
-      }
 
-      double total = wall_secs() - train_start;
-      std::cout << "\n[DONE]  Training finished in "
-                << std::fixed << std::setprecision(1) << total << "s ("
-                << total / 60.0 << " min)  |  Best val loss: "
-                << std::setprecision(4) << best_val_loss << "\n";
-      std::cout << "[SAVE]  Best weights saved to " << model_path << "\n";
+            double step_ms = (wall_secs() - step_start) * 1000.0;
+            int tok_per_sec = (step_ms > 0.0)
+                                  ? (int)((long)BATCH_SIZE * BLOCK_SIZE / (step_ms / 1000.0))
+                                  : 0;
 
-      //  Continuous generation
-      std::cout << "\n"
-                << std::string(60, '-') << "\n";
-      std::cout << "  MODEL OUTPUT  (Ctrl+C to stop)\n";
-      std::cout << std::string(60, '-') << "\n\n";
+            // every EVAL_INTERVAL steps: compute val, save if best, sample
+            bool better = false;
+            if (iter % EVAL_INTERVAL == 0 || iter == MAX_ITERS)
+            {
+                  last_val_loss = estimate_loss(model, dl, "val", rng);
+                  if (last_val_loss < best_val_loss)
+                  {
+                        best_val_loss = last_val_loss;
+                        model.save(model_path);
+                        better = true;
+                  }
+            }
 
-      model.load(model_path);
-      model.rng = std::mt19937(SEED + 42);
+            // print every step
+            std::cout
+                << "step"
+                << std::setw(5) << iter << "/" << MAX_ITERS
+                << " | loss "
+                << std::fixed << std::setprecision(6) << batch_loss
+                << " | val "
+                << std::fixed << std::setprecision(6) << last_val_loss
+                << " | lr "
+                << std::scientific << std::setprecision(2) << (float)LEARNING_RATE
+                << " | "
+                << std::fixed << std::setprecision(2) << step_ms << " ms"
+                << " | " << tok_per_sec << " tok/s"
+                << (better ? "  *best*" : "")
+                << "\n";
+            std::cout.flush();
 
-      std::vector<int> ctx = {0};
-      while (!g_interrupted)
-      {
-            ctx = model.generate(ctx, 1);
-            std::cout << dl.decode({ctx.back()}) << std::flush;
-            if ((int)ctx.size() > BLOCK_SIZE)
-                  ctx = std::vector<int>(ctx.end() - BLOCK_SIZE, ctx.end());
+            // sample after every eval window
+            if (iter % EVAL_INTERVAL == 0 || iter == MAX_ITERS)
+            {
+                  std::cout << "generating:\n";
+                  sample_tokens(model, dl, iter == MAX_ITERS ? 10000 : 150);
+            }
       }
 
-      std::cout << "\n\n[Stopped by user]\n";
-      std::cout << "[TOTAL] Wall-clock: "
-                << std::fixed << std::setprecision(1)
-                << (wall_secs() - train_start) << "s\n";
       return 0;
 }
\ No newline at end of file
diff --git a/run.md b/run.md
deleted file mode 100644
index a2c0e65..0000000
--- a/run.md
+++ /dev/null
@@ -1,492 +0,0 @@
-# Quadtrix.cpp
-
-Quadtrix.cpp is a local GPT-style language model project with multiple runtime paths:
-
-- Native C++ inference and training through `Quadtrix.exe` / `main.cpp`
-- PyTorch checkpoint inference through `engine/inference.py` and `engine/best_model .pt`
-- FastAPI middleware in `backend/`
-- React + TypeScript chat UI in `frontend/`
-
-The web interface can chat with both model backends:
-
-- `C++`: calls the C++ HTTP server on port `8080`
-- `.pt`: loads the PyTorch checkpoint directly from `engine/best_model .pt`
-
-## Project Layout
-
-```text
-Quadtrix.cpp/
-  Quadtrix.exe
-  main.cpp
-  config/
-  include/
-  data/
-  engine/
-    inference.py
-    main.py
-    fine-tune/main.py
-    best_model .pt
-    fineweb_30mb.txt
-  backend/
-    main.py
-    inference.py
-    requirements.txt
-  frontend/
-    package.json
-    src/
-```
-
-## Requirements
-
-- Python 3.10+
-- Node.js 18+
-- npm
-- C++17 compiler if you want to rebuild the C++ executable
-
-## 1. Python Setup
-
-From the repo root:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-python -m venv .venv
-.\.venv\Scripts\python.exe -m pip install --upgrade pip
-```
-
-Install backend and PyTorch inference dependencies:
-
-```powershell
-cd backend
-..\.venv\Scripts\python.exe -m pip install -r requirements.txt
-```
-
-## 2. Frontend Setup
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend
-npm.cmd install
-npm.cmd run build
-```
-
-Run the frontend:
-
-```powershell
-npm.cmd run dev
-```
-
-Frontend URL:
-
-```text
-http://localhost:5173
-```
-
-## Install as a Web App
-
-The frontend is configured as an installable PWA. It includes:
-
-- `frontend/manifest.webmanifest`
-- `frontend/sw.js`
-- `frontend/public/manifest.webmanifest`
-- `frontend/public/sw.js`
-- service worker registration in `frontend/src/registerServiceWorker.ts`
-
-For the clean installable version, build and preview the frontend:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend
-npm.cmd run build
-npm.cmd run preview
-```
-
-Open the preview URL, usually:
-
-```text
-http://localhost:4173
-```
-
-Then install from the browser:
-
-- Chrome / Edge: click the install icon in the address bar
-- Or open browser menu -> Apps -> Install this site as an app
-
-The installed app still talks to the backend at:
-
-```text
-http://localhost:3001
-```
-
-So keep the FastAPI backend running when chatting.
-
-## 3. Run the PyTorch `.pt` Model in the Web UI
-
-The `.pt` model does not need a separate model server. The FastAPI backend loads it directly from:
-
-```text
-engine/best_model .pt
-```
-
-Start the backend:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\backend
-..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001
-```
-
-Start the frontend in another terminal:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend
-npm.cmd run dev
-```
-
-Open:
-
-```text
-http://localhost:5173
-```
-
-Select `.pt` in the top bar.
-
-## 4. Run the C++ Model in the Web UI
-
-Start the C++ inference server:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\Quadtrix.exe data\input.txt --server --port 8080
-```
-
-Start the backend:
-
-```powershell
-cd backend
-..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001
-```
-
-Start the frontend:
-
-```powershell
-cd ..\frontend
-npm.cmd run dev
-```
-
-Open:
-
-```text
-http://localhost:5173
-```
-
-Select `C++` in the top bar.
-
-## 5. Run Both Backends Together
-
-Use three terminals.
-
-Terminal 1:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\Quadtrix.exe data\input.txt --server --port 8080
-```
-
-Terminal 2:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\backend
-..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001
-```
-
-Terminal 3:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend
-npm.cmd run dev
-```
-
-Open:
-
-```text
-http://localhost:5173
-```
-
-Switch between `C++` and `.pt` from the model selector.
-
-## 6. Backend API
-
-Base URL:
-
-```text
-http://localhost:3001
-```
-
-Routes:
-
-```text
-GET    /api/health
-GET    /api/stats
-POST   /api/chat
-GET    /api/sessions
-POST   /api/sessions
-DELETE /api/sessions/{id}
-GET    /api/sessions/{id}/messages
-POST   /api/feedback
-```
-
-Example `.pt` chat request:
-
-```powershell
-Invoke-RestMethod `
-  -Uri http://localhost:3001/api/chat `
-  -Method Post `
-  -ContentType "application/json" `
-  -Body '{
-    "session_id": null,
-    "prompt": "Once upon a time",
-    "max_tokens": 100,
-    "temperature": 1.0,
-    "stream": false,
-    "model_backend": "torch"
-  }'
-```
-
-Example C++ chat request:
-
-```powershell
-Invoke-RestMethod `
-  -Uri http://localhost:3001/api/chat `
-  -Method Post `
-  -ContentType "application/json" `
-  -Body '{
-    "session_id": null,
-    "prompt": "Once upon a time",
-    "max_tokens": 100,
-    "temperature": 1.0,
-    "stream": false,
-    "model_backend": "cpp"
-  }'
-```
-
-## 7. Environment Variables
-
-Backend defaults are in `backend/.env.example`:
-
-```text
-API_PORT=3001
-CORS_ORIGINS=http://localhost:5173
-REDIS_URL=
-LOG_LEVEL=INFO
-MAX_SESSIONS=1000
-SESSION_TTL_HOURS=24
-CPP_SERVER_URL=http://localhost:8080
-TORCH_CHECKPOINT_PATH=../engine/best_model .pt
-REQUEST_TIMEOUT_SECONDS=60
-```
-
-Create `backend/.env` if you want overrides.
-
-Frontend defaults are in `frontend/.env.example`:
-
-```text
-VITE_API_BASE_URL=http://localhost:3001
-```
-
-## 8. PyTorch CLI Inference
-
-Interactive chat:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\.venv\Scripts\python.exe engine\inference.py --checkpoint "engine\best_model .pt"
-```
-
-Generate once:
-
-```powershell
-.\.venv\Scripts\python.exe engine\inference.py --checkpoint "engine\best_model .pt" --prompt "Hello" --max-new-tokens 100 --temperature 1.0
-```
-
-## 9. PyTorch Training
-
-Main training:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\.venv\Scripts\python.exe engine\main.py
-```
-
-Fine-tuning:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\.venv\Scripts\python.exe engine\fine-tune\main.py
-```
-
-## 10. C++ Build and Run
-
-Build manually:
-
-```powershell
-g++ -std=c++17 -O2 -I. -Iinclude -o Quadtrix.exe main.cpp
-```
-
-Train from scratch:
-
-```powershell
-.\Quadtrix.exe data\input.txt
-```
-
-Terminal chat:
-
-```powershell
-.\Quadtrix.exe data\input.txt --chat
-```
-
-Raw generation:
-
-```powershell
-.\Quadtrix.exe data\input.txt --generate
-```
-
-HTTP server:
-
-```powershell
-.\Quadtrix.exe data\input.txt --server --port 8080
-```
-
-## 11. Health Checks
-
-Backend:
-
-```powershell
-Invoke-RestMethod http://localhost:3001/api/health
-```
-
-C++ server:
-
-```powershell
-Invoke-RestMethod http://localhost:8080/health
-```
-
-Frontend:
-
-```text
-http://localhost:5173
-```
-
-When only `.pt` is available, backend health should show:
-
-```json
-{
-  "status": "degraded",
-  "api": "ok",
-  "cpp_server": "unreachable",
-  "torch_model": "ok"
-}
-```
-
-When both are available, backend health should show:
-
-```json
-{
-  "status": "ok",
-  "api": "ok",
-  "cpp_server": "ok",
-  "torch_model": "ok"
-}
-```
-
-## 12. Troubleshooting
-
-### PowerShell blocks `npm`
-
-Use `npm.cmd`:
-
-```powershell
-npm.cmd run dev
-npm.cmd run build
-```
-
-### `.pt` model is unavailable
-
-Check that this file exists:
-
-```text
-engine/best_model .pt
-```
-
-Then check Python dependencies:
-
-```powershell
-cd backend
-..\.venv\Scripts\python.exe -c "import torch, tiktoken; print(torch.__version__)"
-```
-
-### Backend cannot import FastAPI
-
-Install dependencies into the repo venv:
-
-```powershell
-cd backend
-..\.venv\Scripts\python.exe -m pip install -r requirements.txt
-```
-
-### C++ option is offline
-
-Start the C++ server:
-
-```powershell
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\Quadtrix.exe data\input.txt --server --port 8080
-```
-
-### Frontend cannot reach backend
-
-Check:
-
-```text
-http://localhost:3001/api/health
-```
-
-Make sure frontend config points to:
-
-```text
-VITE_API_BASE_URL=http://localhost:3001
-```
-
-### Port already in use
-
-```powershell
-Get-NetTCPConnection -LocalPort 3001
-Get-NetTCPConnection -LocalPort 5173
-Get-NetTCPConnection -LocalPort 8080
-```
-
-## Recommended Daily Run
-
-```powershell
-# Terminal 1
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp
-.\Quadtrix.exe data\input.txt --server --port 8080
-```
-
-```powershell
-# Terminal 2
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\backend
-..\.venv\Scripts\python.exe -m uvicorn main:app --host 127.0.0.1 --port 3001
-```
-
-```powershell
-# Terminal 3
-cd C:\Users\Admin\Documents\GitHub\Quadtrix.cpp\frontend
-npm.cmd run dev
-```
-
-Open:
-
-```text
-http://localhost:5173
-```
-
-## License
-
-MIT
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100644
index 0000000..e36678b
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+# Quadtrix.cpp — build.sh: Docker Compose wrapper for building and running the stack. Usage:
+#   ./scripts/build.sh               # full stack, CPU
+#   ./scripts/build.sh dev           # hot-reload dev mode
+#   ./scripts/build.sh gpu           # CUDA backend
+#   ./scripts/build.sh cpp-only      # compile + run C++ engine
+#   ./scripts/build.sh train-cpp     # train with C++ backend
+#   ./scripts/build.sh train-torch   # train with PyTorch backend
+#   ./scripts/build.sh bench         # run benchmark
+#   ./scripts/build.sh clean         # remove containers + volumes
+#   ./scripts/build.sh logs          # tail all service logs
+#   ./scripts/build.sh ps | shell [svc]   # container status | shell into a service (default: backend)
+
+set -euo pipefail
+
+BOLD="\033[1m"        # ANSI escape sequences; stored literally and interpreted by `echo -e`
+GREEN="\033[0;32m"
+CYAN="\033[0;36m"
+YELLOW="\033[1;33m"
+RED="\033[0;31m"
+RESET="\033[0m"       # restores the terminal's default attributes
+
+info()    { echo -e "${CYAN}[quadtrix]${RESET} $*"; }    # progress message on stdout
+success() { echo -e "${GREEN}[quadtrix]${RESET} $*"; }   # completion message on stdout
+warn()    { echo -e "${YELLOW}[quadtrix]${RESET} $*"; }  # warning, also on stdout
+error()   { echo -e "${RED}[quadtrix] ERROR:${RESET} $*" >&2; }  # error message on stderr
+
+COMPOSE_BASE="docker compose -f docker-compose.yml"       # command strings: expanded unquoted below so they word-split into argv
+COMPOSE_DEV="${COMPOSE_BASE} -f docker-compose.dev.yml"   # base file plus the dev override file
+COMPOSE_GPU="${COMPOSE_BASE} -f docker-compose.gpu.yml"   # base file plus the GPU override file
+
+check_docker() {  # Exit the script with status 1 unless `docker info` succeeds.
+    if ! docker info &>/dev/null; then  # output is discarded; only the exit status matters
+        error "Docker daemon is not running. Start Docker Desktop or the Docker service."
+        exit 1
+    fi
+}
+
+check_nvidia() {  # Report the first GPU name, or warn if nvidia-smi is missing; never aborts.
+    if ! command -v nvidia-smi &>/dev/null; then
+        warn "nvidia-smi not found — GPU mode may not work."
+    else
+        info "GPU detected: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"  # first line of the query output only
+    fi
+}
+
+pull_cache() {  # Best-effort image pull. NOTE(review): no command in this script calls this function.
+    info "Pulling build cache images (if available)..."
+    $COMPOSE_BASE pull --ignore-pull-failures 2>/dev/null || true  # errors are silenced and never fail the script
+}
+
+cmd_up() {  # Build and start the default stack detached, then print the service URLs.
+    check_docker
+    info "Starting full stack (CPU)..."
+    $COMPOSE_BASE up --build -d  # -d: return to the shell once containers are started
+    success "Stack is up."
+    echo ""
+    echo -e "  ${BOLD}Frontend:${RESET}  http://localhost:5173"
+    echo -e "  ${BOLD}API:${RESET}       http://localhost:3001/api/health"
+    echo -e "  ${BOLD}Docs:${RESET}      http://localhost:3001/docs"
+}
+
+cmd_dev() {  # Build and run the stack with the dev override file, attached to the terminal.
+    check_docker
+    info "Starting in DEV mode (hot-reload)..."
+    $COMPOSE_DEV up --build  # no -d: stays in the foreground until interrupted
+}
+
+cmd_gpu() {  # Build and start the stack detached with the GPU override file.
+    check_docker
+    check_nvidia  # advisory only: a missing nvidia-smi produces a warning, not an abort
+    info "Starting with CUDA GPU support..."
+    $COMPOSE_GPU up --build -d
+    success "GPU stack is up."
+}
+
+cmd_cpp_only() {  # Run the `cpp` service once (profile `cpp`), forwarding any extra CLI arguments.
+    check_docker
+    info "Compiling and running C++ engine..."
+    $COMPOSE_BASE --profile cpp run --rm cpp "$@"  # --rm: remove the one-off container afterwards
+}
+
+cmd_train_cpp() {  # Run the `train-cpp` service once (profile `train`); extra arguments are ignored.
+    check_docker
+    info "Training with C++ backend..."
+    $COMPOSE_BASE --profile train run --rm train-cpp  # under `set -e`, a failed run exits before the success message
+    success "C++ training complete. Checkpoint saved in 'models' volume."
+}
+
+cmd_train_torch() {  # Run the `train-torch` service once (profile `train`); extra arguments are ignored.
+    check_docker
+    info "Training with PyTorch backend..."
+    $COMPOSE_BASE --profile train run --rm train-torch  # under `set -e`, a failed run exits before the success message
+    success "PyTorch training complete. Checkpoint saved in 'models' volume."
+}
+
+cmd_bench() {  # Run the `benchmark` service once (profile `benchmark`).
+    check_docker
+    info "Running benchmark..."
+    $COMPOSE_BASE --profile benchmark run --rm benchmark
+}
+
+cmd_logs() {  # Follow the logs of the base stack's services.
+    check_docker
+    $COMPOSE_BASE logs -f --tail=100  # start from the last 100 lines per service, then stream
+}
+
+cmd_clean() {  # Remove the stack's containers and volumes after an interactive y/N confirmation.
+    check_docker
+    warn "This will remove all containers and volumes (including saved models!)"
+    read -r -p "Are you sure? [y/N] " confirm || confirm=""  # EOF on stdin means "no" instead of tripping `set -e`
+    if [[ "${confirm}" == [yY] ]]; then  # glob match replaces ${confirm,,}, which needs bash >= 4 (macOS ships 3.2)
+        $COMPOSE_BASE down -v --remove-orphans  # -v also deletes the named volumes
+        docker image prune -f --filter "label=org.opencontainers.image.source=https://github.com/Eamon2009/Quadtrix.cpp"
+        success "Cleaned."
+    else
+        info "Aborted."
+    fi
+}
+
+cmd_ps() {  # List the base stack's containers; unlike the other commands, runs without check_docker.
+    $COMPOSE_BASE ps
+}
+
+cmd_shell() {  # Open /bin/sh inside a running service container; runs without check_docker.
+    service="${1:-backend}"  # first argument selects the service, defaulting to backend (variable is not `local`)
+    info "Opening shell in '${service}'..."
+    $COMPOSE_BASE exec "${service}" /bin/sh
+}
+CMD="${1:-up}"   # first CLI argument selects the command; no argument means `up`
+shift || true    # drop the command word; `shift` fails when there were no arguments, which is fine
+
+case "${CMD}" in  # remaining arguments are forwarded to the selected handler
+    up)           cmd_up "$@" ;;
+    dev)          cmd_dev "$@" ;;
+    gpu)          cmd_gpu "$@" ;;
+    cpp-only)     cmd_cpp_only "$@" ;;
+    train-cpp)    cmd_train_cpp "$@" ;;
+    train-torch)  cmd_train_torch "$@" ;;
+    bench)        cmd_bench "$@" ;;
+    logs)         cmd_logs "$@" ;;
+    clean)        cmd_clean "$@" ;;
+    ps)           cmd_ps "$@" ;;
+    shell)        cmd_shell "$@" ;;
+    *)
+        echo -e "Usage: ./scripts/build.sh ${BOLD}[command]${RESET}\n"
+        echo "Commands:"
+        echo "  up           Full stack (CPU) — default"
+        echo "  dev          Hot-reload dev mode"
+        echo "  gpu          CUDA GPU stack"
+        echo "  cpp-only     Run C++ engine CLI"
+        echo "  train-cpp    Train with C++ backend"
+        echo "  train-torch  Train with PyTorch"
+        echo "  bench        Benchmark"
+        echo "  logs         Tail logs"
+        echo "  ps           Show container status"
+        echo "  shell [svc]  Shell into service (default: backend)"
+        echo "  clean        Remove all containers + volumes"
+        [[ "${CMD}" == "help" || "${CMD}" == "-h" || "${CMD}" == "--help" ]] || exit 1  # unknown command: fail so callers and CI notice
+        ;;
+esac