diff --git a/.github/workflows/mirror-chart.yml b/.github/workflows/mirror-chart.yml index 2e975e05..f08c3c61 100644 --- a/.github/workflows/mirror-chart.yml +++ b/.github/workflows/mirror-chart.yml @@ -38,7 +38,7 @@ jobs: repository: cloudzero/cloudzero-charts ref: ${{ github.ref }} - - name: Mirror helm directory + - name: Mirror helm directory and changelog files run: | export GIT_COMMITTER_NAME="github-actions[bot]" export GIT_COMMITTER_EMAIL="github-actions[bot]@users.noreply.github.com" @@ -46,7 +46,11 @@ jobs: while read REVISION; do git checkout $REVISION - rsync -av --del --exclude='.git' helm/ cloudzero-charts/charts/cloudzero-agent + # Mirror helm directory (excluding legacy release files) + rsync -av --del --exclude='.git' --exclude='docs/releases/' helm/ cloudzero-charts/charts/cloudzero-agent + # Mirror only the centralized changelog files to the chart's docs/releases directory + mkdir -p cloudzero-charts/charts/cloudzero-agent/docs/releases + rsync -av docs/releases/ cloudzero-charts/charts/cloudzero-agent/docs/releases/ || true export GIT_AUTHOR_NAME="$(git show -s --format='%an' $REVISION)" export GIT_AUTHOR_EMAIL="$(git show -s --format='%ae' $REVISION)" MESSAGE="$(git show -s --format='%B' $REVISION)" diff --git a/.github/workflows/release-to-main.yml b/.github/workflows/release-to-main.yml index 52ec5146..fc8d093d 100644 --- a/.github/workflows/release-to-main.yml +++ b/.github/workflows/release-to-main.yml @@ -24,8 +24,10 @@ jobs: persist-credentials: true fetch-depth: 0 # fetch the whole repo history - - name: Verify release notes exist - run: test -f "helm/docs/releases/${{ github.event.inputs.version }}.md" + - name: Verify changelog exists + run: | + MINOR_VERSION=$(echo "${{ github.event.inputs.version }}" | cut -d. 
-f1,2) + test -f "docs/releases/CHANGELOG-${MINOR_VERSION}.md" - name: Setup Git run: | @@ -59,11 +61,32 @@ jobs: - name: Push changes to main run: git push --atomic origin develop main "v${{ github.event.inputs.version }}" + - name: Extract release notes from changelog + run: | + MINOR_VERSION=$(echo "${{ github.event.inputs.version }}" | cut -d. -f1,2) + CHANGELOG_FILE="docs/releases/CHANGELOG-${MINOR_VERSION}.md" + + # Extract the section whose level-2 heading contains the version; index() matches it literally, so the dots are not regex wildcards + awk -v version="${{ github.event.inputs.version }}" ' + /^## / && index($0, version) { found=1; next } + /^## / && found { exit } + found { print } + ' "$CHANGELOG_FILE" > release_notes.md + + # If no version-specific section found, use the latest changes + if [ ! -s release_notes.md ]; then + echo "No specific section found for ${{ github.event.inputs.version }}, using latest changes from changelog" + awk ' + /^## / { if (first_section) exit; first_section=1; next } + first_section { print } + ' "$CHANGELOG_FILE" > release_notes.md + fi + - name: Create Release uses: softprops/action-gh-release@v2 with: token: ${{ secrets.GITHUB_TOKEN }} tag_name: v${{ github.event.inputs.version }} - body_path: helm/docs/releases/${{ github.event.inputs.version }}.md + body_path: release_notes.md prerelease: ${{ contains(github.event.inputs.version, '-beta-') || contains(github.event.inputs.version, '-rc-') }} draft: true diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..80432c74 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,204 @@ +# CloudZero Agent Development Guide + +## Repository Overview + +This repository contains the complete CloudZero Agent ecosystem for Kubernetes integration with the CloudZero platform. It includes multiple applications, a comprehensive Helm chart, and supporting tools. 
+ +### Key Components + +- **Applications**: Insights Controller, Collector, Shipper, Agent Validator, Agent (supports federated mode) +- **Helm Chart**: Complete Kubernetes deployment with automated testing and validation +- **Release Process**: Automated chart mirroring to `cloudzero-charts` repository on `develop` branch pushes +- **Testing**: Comprehensive test suites including unit, integration, helm schema, and smoke tests + +### Development Commands + +- `make build` - Build all binaries +- `make test` - Run unit tests +- `make helm-test` - Run helm validation tests +- `make package-build` - Build Docker images locally +- `TAG_VERSION=X.Y.Z make generate-changelog` - Generate/update changelog + +### Release Process + +1. A changelog must exist in `docs/releases/CHANGELOG-X.Y.md` (X.Y is the minor series of the release version) +2. Manual "Prepare Release" GitHub workflow merges `develop` to `main` +3. Automatic chart mirroring syncs `helm/` and the `docs/releases/` changelogs to `cloudzero-charts` repository +4. Tags and GitHub releases are created automatically + +## Changelog Format Guidelines + +When generating or updating changelog files for CloudZero Agent releases, follow these formatting guidelines based on the existing changelog structure: + +### File Structure + +- **Filename**: `docs/releases/CHANGELOG-X.Y.md` (where X.Y is the minor version, e.g., 1.2) +- **Title**: `# CloudZero Agent X.Y.Z - Changelog` + +### Content Structure + +1. **Overview Section** + + - Brief summary of the release series + - Highlight major themes or features introduced + +2. **Major Features Section** + + - List new significant features with descriptive subsections + - Use `###` for feature names + - Include bullet points with detailed descriptions + - Focus on user-facing capabilities and benefits + +3. 
**Performance/Architecture Improvements** + + - Separate section for performance enhancements + - Include monitoring, efficiency, and architectural changes + - Use clear metrics when available (e.g., "every 1 minute, previously 2 minutes") + +4. 
**Configuration Changes** + + - New configuration options with default values + - Breaking changes in configuration + - API key management changes + +5. **Bug Fixes Section** - CRITICAL PLACEMENT RULES + + - **Section Header**: "## Reliability and Bug Fixes" followed by "### Major Bug Fixes Across X.Y.Z Series" + - **Version Subsections**: Organize by version using "#### X.Y.Z Fixes" format + - **New Versions**: Add new version fixes in chronological order WITHIN this existing section + - **Location**: NEVER add content after "## Upgrade Path" or "## Version History" + - **Format**: Use bullet points with "**Issue**: Description" format + +6. **Breaking Changes** + + - Clearly list any breaking changes + - Explain impact and migration requirements + +7. **Security Section** (if applicable) + + - Vulnerability status + - Security improvements + +8. **Upgrade Path** + + - Provide clear upgrade instructions + - Include helm commands with version placeholders + +9. **Version History** + - List all versions in the series with release dates + - Brief description of each version's focus + - UPDATE this section to include the new version being added + +### CRITICAL SECTION PLACEMENT RULES + +When adding content for a new version (e.g., 1.2.3): + +1. **Major Features**: Add new subsections under existing "## Major Features" section using "### Feature Name" format +2. **Performance Improvements**: Add under existing "## Performance and Efficiency Improvements" section +3. **Bug Fixes**: Add under existing "### Major Bug Fixes Across 1.2.X Series" as "#### 1.2.3 Fixes" +4. **Version History**: Update the existing list to include the new version +5. **NEVER**: Add content after "## Upgrade Path" section +6. **NEVER**: Create duplicate section headers +7. 
**ALWAYS**: Maintain existing document structure and only INSERT within existing sections + +### Content Classification Guidelines + +**Major Features** (add to "## Major Features"): + +- New standalone applications or services (e.g., collector, shipper, webhook) +- Significant user-facing capabilities +- Configuration simplification that reduces manual setup +- New integration support (cloud providers, tools) +- Auto-detection and zero-configuration capabilities + +**IMPORTANT**: Distinguish between: + +- **Standalone Applications**: collector, shipper, webhook, validator (deployed as separate containers) +- **Shared Packages/Libraries**: scout, utils, storage packages (used across applications) +- **Configuration Features**: Auto-detection capabilities enabled by shared packages + +**Performance Improvements** (add to "## Performance and Efficiency Improvements"): + +- Speed enhancements, memory optimizations +- Reduced resource usage +- Improved scalability + +**Bug Fixes** (add to "### Major Bug Fixes Across 1.2.X Series"): + +- Issue resolution, error fixes +- Template/validation improvements +- Certificate handling fixes + +### Example of Correct Section Placement + +```markdown +## Major Features + +### Existing Feature 1 + +...existing content... + +### Existing Feature 2 + +...existing content... + +### Configuration Automation ← ADD NEW MAJOR FEATURES HERE + +- **Cloud Provider Detection**: Automatic CSP metadata detection +- **Configuration Simplification**: Reduces manual configuration requirements +- **Multi-Cloud Support**: AWS and Google Cloud integration + +## Performance and Efficiency Improvements + +### Existing Performance Section + +...existing content... 
+ +### Configuration Automation ← ADD PERFORMANCE IMPROVEMENTS HERE + +- **Reduced Manual Setup**: Scout eliminates need for manual region/account configuration +- **Faster Deployment**: Automatic environment detection speeds up installation + +## Reliability and Bug Fixes + +### Major Bug Fixes Across 1.2.X Series + +#### 1.2.3 Fixes ← ADD BUG FIXES HERE + +- **Certificate Handling**: Fixed webhook certificate annotations +- **Template Validation**: Enhanced kubeconform integration + +## Version History + +- **1.2.3** (2025-06-27): Major release with Scout application and configuration automation ← UPDATE THIS +``` + +### Formatting Guidelines + +- Use consistent bullet point formatting (`-` for main points) +- Use `**Bold**` for emphasis on key terms and features +- Use code blocks for configuration examples and commands +- Use consistent date format: (YYYY-MM-DD) +- Group related changes under logical subsections +- Use present tense for describing features ("Provides", "Enables", "Supports") +- Focus on user benefits rather than technical implementation details + +### Language Style + +- Write user-focused descriptions +- Emphasize benefits and improvements +- Use clear, non-technical language where possible +- Be specific about improvements (include metrics when available) +- Maintain consistent tone across sections + +### Example Entry Structure + +```markdown +### Feature Name + +- **Key Capability**: Description of what it does for users +- **Benefits**: How it helps users +- **Configuration**: Any relevant configuration details +``` + +This format ensures consistency across all CloudZero Agent changelog files and provides clear, actionable information for users upgrading or reviewing changes. 
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index aeecf9a2..34651345 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -1,147 +1,309 @@ -# Developers Quick Start +# CloudZero Agent Development Guide -## Prerequisites +## Repository Overview -Before getting started with the development of the CloudZero Agent Validator, make sure you have the following prerequisites installed on your system: +This repository contains the complete CloudZero Agent ecosystem for Kubernetes integration with the CloudZero platform. It includes: -- [Go 1.24+](https://go.dev/doc/install) -- [Docker](https://docs.docker.com/engine/install/) -- [Rancher Desktop - recommended for testing](https://ranchermanager.docs.rancher.com/getting-started/installation-and-upgrade) -- [Github Actions Utility - for local ci/cd testing](https://github.com/nektos/act) -- [Protocol buffer](https://developers.google.com/protocol-buffers) compiler, `protoc`, [version 3](https://protobuf.dev/programming-guides/proto3). -- **Go plugins** for the protocol compiler: +### Core Applications - 1. Install the protocol compiler plugins for Go using the following commands: +- **CloudZero Insights Controller** - Webhook application that collects resource labels, annotations, and metadata +- **CloudZero Collector** - Prometheus-compatible metrics collection service +- **CloudZero Shipper** - File monitoring and S3 upload service +- **CloudZero Agent Validator** - Installation validation and lifecycle management +- **CloudZero Agent** - Metrics scraping service (supports federated mode for large clusters) - ```sh - $ go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.28 - $ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2 - ``` +### Helm Chart - 2. 
Update your `PATH` so that the `protoc` compiler can find the plugins: +- **Helm Chart** (`helm/`) - Complete Kubernetes deployment configuration +- **Chart Testing** - Comprehensive validation and testing framework +- **Schema Validation** - JSON schema validation for configuration - ```sh - $ export PATH="$PATH:$(go env GOPATH)/bin" - ``` +### Utilities and Tools -## Development Quick Start +- **Helmless Tool** - Configuration analysis and minimal override generation +- **Agent Inspector** - Debugging and inspection utilities +- **Scout** - Cloud provider metadata detection (AWS, Google Cloud) -To quickly get started with the development of the CloudZero Agent Validator, follow these steps: +## Development Setup -### 1. Repository Setup +### Prerequisites -1. Clone the repository: +- [Go 1.24+](https://go.dev/doc/install) +- [Docker](https://docs.docker.com/engine/install/) +- [Helm 3.14+](https://helm.sh/docs/intro/install/) +- [Kubernetes cluster](https://kubernetes.io/docs/setup/) (local or remote) +- [Protocol Buffers compiler](https://protobuf.dev/downloads/) - ```sh - $ git clone https://github.com/Cloudzero/cloudzero-agent.git +### Quick Start + +1. **Clone and Setup** + + ```bash + git clone https://github.com/cloudzero/cloudzero-agent.git + cd cloudzero-agent + make install-tools ``` -2. Change to the project directory: +2. **Build Applications** - ```sh - $ cd cloudzero-agent + ```bash + make build ``` -### 2. Building the Code +3. **Run Tests** -1. Install the project dependencies: + ```bash + make test + make helm-test + ``` - ```sh - $ go mod download - $ make install-tools +4. **Build Docker Images** + ```bash + make package-build ``` -2. 
Generate the status protobuf definition package: +## Development Workflow + +### Code Organization + +``` +app/ +├── functions/ # Standalone applications +│ ├── agent-validator/ +│ ├── collector/ +│ ├── shipper/ +│ └── webhook/ +├── domain/ # Core business logic +├── handlers/ # HTTP handlers +├── storage/ # Data persistence +└── types/ # Shared types and protobuf definitions + +helm/ # Helm chart +├── templates/ # Kubernetes manifests +├── values.yaml # Default configuration +└── docs/ # Chart documentation + +tests/ # Test suites +├── helm/ # Helm chart tests +├── integration/ # Integration tests +└── smoke/ # Smoke tests +``` + +### Key Make Targets + +```bash +# Development +make build # Build all binaries +make test # Run unit tests +make test-integration # Run integration tests +make lint # Run linters +make format # Format code + +# Docker +make package-build # Build Docker images locally +make package # Build and push Docker images + +# Helm +make helm-install # Install chart locally +make helm-test # Run helm validation tests +make helm-lint # Lint helm chart +make helm-template # Generate templates + +# Changelog +make generate-changelog # Generate changelog (TAG_VERSION=1.2.3) +``` + +### Environment Configuration + +Create `local-config.mk` for local overrides: + +```makefile +# API Configuration +CLOUDZERO_DEV_API_KEY=your-dev-api-key +CLOUDZERO_HOST=dev-api.cloudzero.com + +# Cluster Configuration +CLUSTER_NAME=my-test-cluster +CLOUD_ACCOUNT_ID=123456789 +CSP_REGION=us-east-1 +``` + +## Release Process + +### Overview + +The CloudZero Agent follows a structured release process with automated chart mirroring to the [cloudzero-charts](https://github.com/cloudzero/cloudzero-charts) repository. + +### Release Workflow + +1. **Development** - Work on `develop` branch +2. **Chart Mirroring** - Automatic sync to `cloudzero-charts` on push to `develop` +3. **Release Preparation** - Manual workflow creates release branch and tags +4. 
**Release Notes** - A changelog for the release series must exist in `docs/releases/CHANGELOG-{major}.{minor}.md` + +### Creating a Release - ```sh - $ make generate +1. **Prepare Release Notes** + + ```bash + # Create the changelog for the release series if it does not exist yet + touch docs/releases/CHANGELOG-1.2.md + # Add the 1.2.3 release content following the existing format ``` -3. Build the binary: +2. **Generate Changelog** (Optional) - ```sh - $ make build + ```bash + TAG_VERSION=1.2.3 make generate-changelog ``` -### 3. Local Testing +3. **Trigger Release** -To run the go formatter, go linter, unit tests to verify code changes, use the following command: + - Go to GitHub Actions + - Run "Manual Prepare Release" workflow + - Input version (e.g., `1.2.3`) -```sh -make format lint test -``` +4. **Release Process** + - Updates image version in Helm chart + - Merges `develop` into `main` + - Creates Git tag + - Creates GitHub release (draft) -### 4. CI/CD Testing +### Chart Mirroring -Several workflows are defined in the [.github/workflows](.github/workflows) directory to ensure code quality. Before opening a pull request, it is recommended to run these workflows. 
+The `mirror-chart.yml` workflow automatically: -#### Listing Available Workflows +- Syncs `helm/` (excluding `helm/docs/releases/`) and the `docs/releases/` changelogs to `cloudzero-charts/charts/cloudzero-agent/` +- Preserves commit history and authorship +- Runs on every push to `develop` branch -To see the available workflows, run: +## Testing -```sh -act --container-architecture linux/arm64 -l +### Unit Tests + +```bash +make test ``` -**Example:** +### Integration Tests -```sh -$ act --container-architecture linux/arm64 -l -Stage Job ID Job name Workflow name Workflow file Events -0 docker docker DockerBuild docker-build.yml push,pull_request,release -0 build build GoTest golang-build.yml push -0 format format GoFmt golang-fmt.yml push -0 lint lint GoLint golang-lint.yml push -0 release-to-main release-to-main Manual Prepare Release release-to-main.yml workflow_dispatch +```bash +export CLOUDZERO_DEV_API_KEY=your-key +make test-integration ``` -#### Linting a Workflow +### Helm Chart Tests -If you are working on the CI/CD Action Workflows, it is useful to perform a `--dry-run` on the workflow to ensure the syntax is valid. To do this, run: +```bash +make helm-test # All helm tests +make helm-test-schema # Schema validation +make helm-test-subchart # Subchart tests +``` -```sh -$ act --container-architecture linux/arm64 --dryrun -j lint +### Smoke Tests + +```bash +make test-smoke ``` -#### Running a Workflow +## Debugging -To run a workflow, use the `Job ID` value and pass it into the following command: +### Local Development -```sh -$ act --container-architecture linux/arm64 -j lint -... -[GoLint/lint] Cleaning up container for job lint -[GoLint/lint] 🏁 Job succeeded -``` +1. **Use Debug Images** + + ```bash + make package-build-debug + ``` + +2. **Deploy Debug Container** + + ```bash + kubectl run debug --image=busybox:stable-uclibc --rm -it -- sh + ``` + +3. 
**Monitor Application Logs** + ```bash + kubectl logs -f deployment/cloudzero-agent + ``` + +### Common Issues -For more examples, [see the README in the workflow directory](./.github/workflows/README.md). +- **Certificate Issues** - Check `docs/cert-trouble-shooting.md` +- **Validation Failures** - See `docs/deploy-validation.md` +- **Storage Issues** - Review `docs/storage/` guides -Remember to refer to the available targets in the Makefile for other development tasks. +## Configuration ---- +### Helm Values -# Release Process +Key configuration areas: -Publishing a new release can be accomplished by running the `Manual Prepare Release` workflow. +- **API Authentication** - `global.apiKey`, `global.existingSecretName` +- **Cluster Identification** - `clusterName`, `cloudAccountId`, `region` +- **Component Control** - `components.*.enabled` +- **Resource Limits** - `components.*.resources` + +### Environment Variables + +Applications support configuration via: + +- Helm chart values +- Environment variables +- ConfigMaps +- Secrets + +## Contributing + +1. **Follow existing patterns** - Review similar components +2. **Add tests** - Unit tests for new functionality +3. **Update documentation** - Keep docs current +4. **Validate changes** - Run full test suite + +### Code Style + +- Use `gofumpt` for formatting +- Follow Go best practices +- Add godoc comments for public APIs +- Use structured logging + +### Commit Messages + +Follow conventional commit format: + +``` +type(scope): description + +- feat: new feature +- fix: bug fix +- docs: documentation +- test: testing +- refactor: code refactoring +``` -![](./docs/assets/release-1.png) +## Troubleshooting -**Once run the following occurs:** +### Build Issues -1. _All changes on the `develop` branch_ are merged into the `main` branch. -2. A new semver `tag` is created. -3. A new `pre-release` is created, with the `change log` for changes since the last release. 
+- Ensure Go version matches `go.mod` +- Run `make install-tools` to install dependencies +- Check Docker daemon is running -Next we can visit the release page, and locate the `pre-release` and `click the edit icon`: -![](./docs/assets/release-2.png) +### Test Failures -Finally - we will publish the `draft-release`. Make sure you: +- Verify API key is set for integration tests +- Ensure Kubernetes cluster is accessible +- Check resource limits and permissions -1. Remove the `draft` checkbox -2. Update _`Set as pre-release`_ to **`Set as the latest release`** +### Deployment Issues -![](./docs/assets/release-3.png) +- Validate Helm chart with `make helm-lint` +- Check cluster permissions +- Review application logs -When this is done, it will cause an automated release of the `docker image` for the release value, and `latest` to be created in GHCR. +## Additional Resources -That's it, Happy coding! +- [Configuration Guide](../CONFIGURATION.md) - Detailed configuration options +- [Usage Guide](../USAGE.md) - Usage examples and patterns +- [Contributing Guide](../CONTRIBUTING.md) - Contribution guidelines +- [Release Process](releases/RELEASE_PROCESS.md) - Detailed release procedures diff --git a/Makefile b/Makefile index 3e783d29..938a9ae1 100644 --- a/Makefile +++ b/Makefile @@ -499,3 +499,16 @@ app/types/status/cluster_status.pb.go: app/types/status/cluster_status.proto generate: app/types/clusterconfig/clusterconfig.pb.go app/types/clusterconfig/clusterconfig.pb.go: app/types/clusterconfig/clusterconfig.proto @$(PROTOC) --proto_path=$(dir $@) --go_out=$(dir $<) app/types/clusterconfig/clusterconfig.proto + +# ----------- CHANGELOG GENERATION (requires the `claude` CLI; runs it with --dangerously-skip-permissions) ------------ + +.PHONY: generate-changelog +generate-changelog: ## Generate or update changelog for specified version (TAG_VERSION=1.2.3 make generate-changelog) + @if [ -z "$(TAG_VERSION)" ]; then \ + echo "$(ERROR_COLOR)Error: TAG_VERSION is required. 
Usage: TAG_VERSION=1.2.3 make generate-changelog$(NO_COLOR)"; \ + exit 1; \ + fi + @MINOR_VERSION=$$(echo "$(TAG_VERSION)" | cut -d. -f1,2); \ + CHANGELOG_FILE="docs/releases/CHANGELOG-$$MINOR_VERSION.md"; \ + echo "$(INFO_COLOR)Generating changelog for version $(TAG_VERSION) in $$CHANGELOG_FILE$(NO_COLOR)"; \ + claude -p --dangerously-skip-permissions "CRITICAL: Update CloudZero Agent changelog for version $(TAG_VERSION). Analyze BOTH git commits AND code structure to understand architectural changes:\n\nCODE ANALYSIS REQUIRED:\n1. EXAMINE app/ directory structure to understand applications vs packages\n2. IDENTIFY if changes are new standalone apps or shared library enhancements\n3. UNDERSTAND how scout package integrates with existing applications (collector, shipper, webhook)\n4. RECOGNIZE scout as a Go package/library, NOT a standalone application\n\nCONTENT CLASSIFICATION:\n1. **Configuration Features** (scout package capabilities) → Add under '## Performance and Efficiency Improvements' as configuration automation\n2. **Shared Package Enhancements** → Focus on user benefits, not internal implementation\n3. 
**Bug Fixes** (certificate handling, validation) → Add under '### Major Bug Fixes Across X.Y.Z Series' as '#### $(TAG_VERSION) Fixes'\n\nSPECIFIC FOCUS for $(TAG_VERSION):\n- Scout is a SHARED PACKAGE used across all applications (not a standalone app)\n- Configuration automation is the USER-FACING FEATURE enabled by scout\n- Auto-detection capabilities integrated into collector, shipper, webhook apps\n- Zero-configuration deployment through enhanced app capabilities\n- Certificate/validation improvements are bug fixes\n\nKEY MESSAGING for Configuration Automation:\n- Scout package enables automatic cloud environment detection\n- All CloudZero Agent applications now support auto-configuration\n- Eliminates manual cloudAccountId and region setup across the platform\n- Multi-cloud support (AWS, Google Cloud) built into the agent ecosystem\n\nFocus on user benefits and deployment simplification, not internal package details. Reference CLAUDE.md for architectural understanding. Only modify $$CHANGELOG_FILE." diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 00000000..2045bd44 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,436 @@ +# CloudZero Agent Architecture + +## Overview + +The CloudZero Agent follows hexagonal architecture principles, separating core business logic from external concerns through well-defined interfaces and adapters. This architecture enables testability, maintainability, and flexibility in deployment scenarios. 
+ +## Hexagonal Architecture Overview + +```mermaid +graph TB + subgraph "External World" + K8S[Kubernetes API] + PROM[Prometheus Metrics] + S3[S3 Storage] + FS[File System] + HTTP[HTTP Clients] + end + + subgraph "Adapters (Infrastructure)" + direction TB + CONFIG[Config Adapters] + HANDLERS[HTTP Handlers] + STORAGE[Storage Adapters] + HTTP_CLIENT[HTTP Client] + end + + subgraph "Core Domain" + direction TB + COLLECTOR[Metric Collector] + SHIPPER[File Shipper] + WEBHOOK[Webhook Controller] + VALIDATOR[Validator] + MONITOR[Health Monitor] + end + + subgraph "Applications (Entry Points)" + direction TB + COLLECTOR_APP[Collector App] + SHIPPER_APP[Shipper App] + WEBHOOK_APP[Webhook App] + VALIDATOR_APP[Validator App] + INSPECTOR[Agent Inspector] + end + + K8S --> HANDLERS + PROM --> HANDLERS + HTTP --> HTTP_CLIENT + + HANDLERS --> COLLECTOR + HANDLERS --> WEBHOOK + HTTP_CLIENT --> SHIPPER + CONFIG --> COLLECTOR + CONFIG --> SHIPPER + CONFIG --> WEBHOOK + CONFIG --> VALIDATOR + STORAGE --> COLLECTOR + STORAGE --> SHIPPER + STORAGE --> WEBHOOK + + COLLECTOR_APP --> COLLECTOR + SHIPPER_APP --> SHIPPER + WEBHOOK_APP --> WEBHOOK + VALIDATOR_APP --> VALIDATOR + INSPECTOR --> MONITOR + + SHIPPER --> S3 + COLLECTOR --> FS + SHIPPER --> FS +``` + +## Core Applications (`app/functions/`) + +The `app/functions/` directory contains the main applications that serve as entry points into the system. 
Each application is a standalone binary with specific responsibilities: + +### Collector (`app/functions/collector/`) + +**Purpose**: Prometheus-compatible metrics collection service + +- Implements `/api/v1/write` remote write endpoint +- Classifies and stores metrics in compressed files +- Separates cost telemetry from observability metrics + +```mermaid +graph LR + PROM[Prometheus] --> COLLECTOR[Collector Service] + COLLECTOR --> COST_FILES[Cost Telemetry Files] + COLLECTOR --> OBS_FILES[Observability Files] + COLLECTOR --> COMPRESSION[File Compression] +``` + +### Shipper (`app/functions/shipper/`) + +**Purpose**: File monitoring and S3 upload service + +- Monitors shared locations for metrics files +- Allocates pre-signed S3 URLs via CloudZero API +- Uploads data at configured intervals +- Provides end-to-end file tracking + +```mermaid +graph LR + FILES[Metrics Files] --> MONITOR[File Monitor] + MONITOR --> ALLOCATE[URL Allocation] + ALLOCATE --> CZ_API[CloudZero API] + CZ_API --> S3_URL[Pre-signed URL] + S3_URL --> UPLOAD[S3 Upload] + UPLOAD --> S3[S3 Bucket] +``` + +### Webhook (`app/functions/webhook/`) + +**Purpose**: Kubernetes admission controller for resource metadata collection + +- ValidatingAdmissionWebhook for Kubernetes resources +- Collects labels, annotations, and relationships +- Supports resource provisioning/deprovisioning tracking + +```mermaid +graph LR + K8S_API[Kubernetes API] --> WEBHOOK[Admission Webhook] + WEBHOOK --> VALIDATE[Resource Validation] + WEBHOOK --> COLLECT[Metadata Collection] + COLLECT --> STORE[Storage Layer] + VALIDATE --> RESPONSE[Admission Response] + RESPONSE --> K8S_API +``` + +### Agent Validator (`app/functions/agent-validator/`) + +**Purpose**: Installation validation and lifecycle management + +- Pod lifecycle hook validation +- CloudZero platform status reporting +- Configuration validation + +### Additional Tools + +- **Agent Inspector** (`app/functions/agent-inspector/`): Debugging and system inspection +- 
**Helmless** (`app/functions/helmless/`): Configuration analysis and minimal overrides +- **Scout** (`app/functions/scout/`): Cloud provider metadata detection +- **Cluster Config** (`app/functions/cluster-config/`): Configuration loading utilities + +## Domain Layer (`app/domain/`) + +The domain layer contains the core business logic, isolated from external dependencies: + +### Core Domain Services + +```mermaid +classDiagram + class MetricCollector { + +CollectMetrics() + +ClassifyMetrics() + +WriteToFile() + } + + class Shipper { + +MonitorFiles() + +AllocateURL() + +UploadFile() + +TrackProgress() + } + + class WebhookController { + +HandleAdmission() + +ExtractMetadata() + +ValidateResource() + } + + class HealthMonitor { + +CheckHealth() + +ReportStatus() + } + + class FileMonitor { + +WatchDirectory() + +NotifyChanges() + } + + MetricCollector --> FileMonitor + Shipper --> FileMonitor + WebhookController --> HealthMonitor +``` + +### Key Domain Components + +#### Metric Collection (`app/domain/metric_collector.go`) + +- **Responsibility**: Core metrics processing logic +- **Key Operations**: Classification, filtering, compression +- **Interfaces**: Storage abstraction, time provider + +#### File Shipping (`app/domain/shipper/`) + +- **Responsibility**: File upload orchestration +- **Key Operations**: URL allocation, upload management, error handling +- **Interfaces**: HTTP client, storage, metrics reporting + +#### Webhook Processing (`app/domain/webhook/`) + +- **Responsibility**: Kubernetes admission control +- **Key Operations**: Resource validation, metadata extraction, backfilling +- **Interfaces**: Kubernetes client, storage, certificate management + +#### Health Monitoring (`app/domain/healthz/`) + +- **Responsibility**: System health checks +- **Key Operations**: Component status, dependency validation +- **Interfaces**: External service checks + +## Adapter Layer + +### Configuration Adapters (`app/config/`) + +Handles external configuration 
sources: + +- **Gator Settings**: Core configuration management +- **Validator Config**: Validation-specific configuration +- **Webhook Config**: Admission controller configuration + +### HTTP Handlers (`app/handlers/`) + +External interface adapters: + +- **Remote Write**: Prometheus remote write endpoint +- **Webhook**: Kubernetes admission webhook endpoint +- **Metrics**: Prometheus metrics exposition +- **Profiling**: Debug and profiling endpoints + +### Storage Adapters (`app/storage/`) + +Data persistence abstractions: + +```mermaid +graph TB + subgraph "Storage Interfaces" + RESOURCE_STORE[Resource Store] + METRIC_STORE[Metric Store] + CONFIG_STORE[Config Store] + end + + subgraph "Concrete Implementations" + SQLITE[SQLite Driver] + DISK[Disk Storage] + PARQUET[Parquet Files] + end + + RESOURCE_STORE --> SQLITE + METRIC_STORE --> DISK + METRIC_STORE --> PARQUET + CONFIG_STORE --> DISK +``` + +#### Storage Implementations + +- **SQLite** (`app/storage/sqlite/`): Resource metadata persistence +- **Disk** (`app/storage/disk/`): File-based metrics storage +- **Core** (`app/storage/core/`): Base implementations and patterns + +### HTTP Client Adapters (`app/http/`) + +External service communication: + +- **CloudZero API Client**: Pre-signed URL allocation +- **S3 Upload Client**: File upload operations +- **Middleware**: Request/response processing, retry logic + +## Type System (`app/types/`) + +Shared types and interfaces that define contracts between layers: + +### Core Types + +- **Metric**: Prometheus metric representation +- **Resource**: Kubernetes resource metadata +- **Review**: Admission review structures +- **Storage Interfaces**: Repository patterns + +### Protocol Buffers + +- **Cluster Config**: Configuration message definitions +- **Status Reports**: Health and status reporting + +## Infrastructure Utilities (`app/utils/`) + +Supporting utilities that don't contain business logic: + +- **Clock**: Time abstraction for testing +- **Kubernetes 
Services**: K8s API helpers +- **Parallel Processing**: Concurrency utilities +- **File Locking**: Resource synchronization +- **Telemetry**: Observability helpers + +## Data Flow Architecture + +### Metrics Collection Flow + +```mermaid +sequenceDiagram + participant P as Prometheus + participant C as Collector + participant F as File System + participant S as Shipper + participant API as CloudZero API + participant S3 as S3 Storage + + P->>C: POST /api/v1/write + C->>C: Classify Metrics + C->>F: Write Compressed Files + F->>S: File Created Event + S->>API: Request Pre-signed URL + API->>S: Pre-signed URL + S->>S3: Upload File + S->>API: Confirm Upload +``` + +### Webhook Admission Flow + +```mermaid +sequenceDiagram + participant K as Kubernetes API + participant W as Webhook + participant DB as SQLite Store + participant B as Backfiller + + K->>W: Admission Review + W->>W: Extract Metadata + W->>DB: Store Resource Info + W->>K: Admission Response + + Note over B: Periodic Process + B->>DB: Query Resources + B->>DB: Update Relationships +``` + +## Deployment Architecture + +### Container Structure + +```mermaid +graph TB + subgraph "CloudZero Agent Pod" + COLLECTOR_CONTAINER[Collector Container] + SHIPPER_CONTAINER[Shipper Container] + SHARED_VOLUME[Shared Volume] + end + + subgraph "Webhook Pod" + WEBHOOK_CONTAINER[Webhook Container] + CERT_VOLUME[Certificate Volume] + end + + subgraph "Validator Job" + VALIDATOR_CONTAINER[Validator Container] + end + + COLLECTOR_CONTAINER --> SHARED_VOLUME + SHIPPER_CONTAINER --> SHARED_VOLUME + WEBHOOK_CONTAINER --> CERT_VOLUME +``` + +### Federated Mode (Large Clusters) + +```mermaid +graph TB + subgraph "Node 1" + AGENT1[Agent Instance 1] + METRICS1[Node Metrics 1] + end + + subgraph "Node 2" + AGENT2[Agent Instance 2] + METRICS2[Node Metrics 2] + end + + subgraph "Node N" + AGENTN[Agent Instance N] + METRICSN[Node Metrics N] + end + + subgraph "Shared Storage" + AGGREGATOR[Aggregator Service] + UPLOAD[Upload Service] + end 
+ + AGENT1 --> AGGREGATOR + AGENT2 --> AGGREGATOR + AGENTN --> AGGREGATOR + AGGREGATOR --> UPLOAD +``` + +## Design Principles + +### Hexagonal Architecture Benefits + +1. **Testability**: Core domain isolated from infrastructure +2. **Flexibility**: Easy to swap adapters (SQLite → PostgreSQL) +3. **Maintainability**: Clear separation of concerns +4. **Deployment Options**: Same core, different deployment patterns + +### Key Patterns + +- **Repository Pattern**: Storage abstraction +- **Adapter Pattern**: External service integration +- **Observer Pattern**: File monitoring and events +- **Strategy Pattern**: Different upload strategies +- **Factory Pattern**: Configuration-driven component creation + +### Interface Design + +- All external dependencies are behind interfaces +- Domain layer depends only on abstractions +- Adapters implement domain interfaces +- Applications wire dependencies through dependency injection + +## Extension Points + +### Adding New Metrics Sources + +1. Implement new handler in `app/handlers/` +2. Add metric classification logic in domain layer +3. Update collector application wiring + +### Adding New Storage Backends + +1. Implement storage interfaces in `app/storage/` +2. Add configuration options +3. Update dependency injection in applications + +### Adding New Cloud Providers + +1. Extend scout utilities in `app/utils/scout/` +2. Add provider-specific configuration +3. Update webhook metadata collection + +This architecture enables the CloudZero Agent to handle diverse Kubernetes environments while maintaining clean separation between business logic and infrastructure concerns. 
diff --git a/docs/releases/0.0.1.md b/docs/releases/0.0.1.md deleted file mode 100644 index 5d2ab8ad..00000000 --- a/docs/releases/0.0.1.md +++ /dev/null @@ -1,7 +0,0 @@ -## 0.0.1 (2024-11-13) - -Initial release of the cloudzero-insights-controller - -### New Features - -- **Labels and Annotations:** Deploying this application as a target for validating webhook admission requests allows the gathering of labels and annotations from pods, deployments, statefulsets, daemonsets, jobs, cronjobs, nodes, and namespaces. diff --git a/docs/releases/0.0.2.md b/docs/releases/0.0.2.md deleted file mode 100644 index 17269d5d..00000000 --- a/docs/releases/0.0.2.md +++ /dev/null @@ -1,21 +0,0 @@ -## [0.0.2](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.0.2...v0.0.1) (2024-11-17) - -Various fixes and improvements, including more graceful retries and more granular control over what resources are gathered. - -### Upgrade Steps - -- N/A - -### New Features - -- **Granular Control of Resources Exported:** The types of Kubernetes resources for which labels and/or annotations are gathered can now be set per resource type. This improves performance on the initial scrape job. - -### Bug Fixes - -- **Context Deadline Issue Resolved:** A context timeout was causing timeout errors during the initial scrape job for clusters with relatively higher load. This context timeout was removed in favor of relying on the built-in kube client rate limiter. -- **Better Handling of Database Contention:** Processes that access the database should wait up to five seconds for the lock. - -### Improvements - -- **More Robust Retries for Remote Write:** Remote write requests to the CZ API use an exponential backoff for failed requests. -- **Faster Initial Scrape:** The initial scrape process is allowed to scrape data from the Kubernetes API at a faster rate. 
diff --git a/docs/releases/0.0.3.md b/docs/releases/0.0.3.md deleted file mode 100644 index 7fef5646..00000000 --- a/docs/releases/0.0.3.md +++ /dev/null @@ -1,19 +0,0 @@ -## [0.0.3](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.0.3...v0.0.2) (2024-12-11) - -Monitoring enhancements - -### Upgrade Steps - -- N/A - -### New Features - -- **New `/metrics` endpoint:** Introduced a new endpoint that exposes internal statistics in the Prometheus standard format. This allows for better integration with monitoring tools and provides insights into the application's performance and health. For more details, refer to the [documentation for available metrics](../statistics.md). - -### Bug Fixes - -- N/A - -### Improvements - -- N/A diff --git a/docs/releases/0.0.4.md b/docs/releases/0.0.4.md deleted file mode 100644 index 717a5046..00000000 --- a/docs/releases/0.0.4.md +++ /dev/null @@ -1,20 +0,0 @@ -## [0.0.4](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.0.4...v0.0.3) (2024-12-12) - -Security Policy Enhancements: This change adds reactive configuration updates when API secret or TLS Certificates changed. This allows transparent security policy enforcement, and prevents unnecessary restarts of the service after such events. - -### Upgrade Steps - -- N/A - -### New Features - -- **Automatic detection (and reconfiguration) of secrets rotation.** -- **Automatic detection (and reconfiguration) of TLS Certificate rotation.** - -### Bug Fixes - -- N/A - -### Improvements - -- N/A diff --git a/docs/releases/0.0.5.md b/docs/releases/0.0.5.md deleted file mode 100644 index 4adc12a0..00000000 --- a/docs/releases/0.0.5.md +++ /dev/null @@ -1,19 +0,0 @@ -## [0.0.5](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.0.5...v0.0.4) (2024-12-16) - -Add ability to control logging level for production environments. 
- -### Upgrade Steps - -- N/A - -### New Features - -- **Add log level support with info level default** - -### Bug Fixes - -- N/A - -### Improvements - -- N/A diff --git a/docs/releases/0.0.6.md b/docs/releases/0.0.6.md deleted file mode 100644 index 7bbd06e6..00000000 --- a/docs/releases/0.0.6.md +++ /dev/null @@ -1,25 +0,0 @@ -## [0.0.6](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.0.6...v0.0.4) (2024-12-16) - -- Add support for logging level -- Clean Cloud Account ID from configuration -- Ignore disabled resource types -- Various performance improvements -- Resolve CVE GHSA-w32m-9786-jp63 - -### Upgrade Steps - -- N/A - -### New Features - -- Add support for logging level -- Support disabling specific resource tag collection - -### Bug Fixes - -- Clean Cloud Account ID from configuration -- Resolve CVE GHSA-w32m-9786-jp63 - -### Improvements - -- Various performance improvements diff --git a/docs/releases/0.1.0.md b/docs/releases/0.1.0.md deleted file mode 100644 index fe88d015..00000000 --- a/docs/releases/0.1.0.md +++ /dev/null @@ -1,20 +0,0 @@ -## [0.1.0](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.1.0...v0.0.6) (2024-12-20) - -First primary release for the Labels and Annotations Support. - -### Upgrade Steps - -- N/A - -### New Features - -- N/A - -### Bug Fixes - -- N/A - -### Improvements - -- Improved Cloud Account parameter cleanup support -- Standarized Logging diff --git a/docs/releases/0.1.1.md b/docs/releases/0.1.1.md deleted file mode 100644 index 3b67c4a3..00000000 --- a/docs/releases/0.1.1.md +++ /dev/null @@ -1,22 +0,0 @@ -## [0.1.1](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.1.1...v0.0.4) (2025-01-09) - -First primary release for the Labels and Annotations Support. 
- -### Upgrade Steps - -- N/A - -### New Features - -- N/A - -### Bug Fixes - -- Fixed issue in which cascading context cancellations can result in some records not getting uploaded - -### Improvements - -- Improved Cloud Account parameter cleanup support -- Standarized Logging -- Improved backfill processing -- Internal metrics are exposed diff --git a/docs/releases/0.1.2.md b/docs/releases/0.1.2.md deleted file mode 100644 index b9a425aa..00000000 --- a/docs/releases/0.1.2.md +++ /dev/null @@ -1,21 +0,0 @@ -## [0.1.2](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.1.2...v0.1.1) (2025-03-03) - -Bug fixes for prometheus and excess logging - -### Upgrade Steps - -- N/A - -### New Features - -- N/A - -### Bug Fixes - -- Fixed issue where a prometheus metric label was not defined correctly -- Changed the log level for a few log messages - -### Improvements - -- Improved the clarify of the sqlite connection string -- Added a new test to ensure the sqlite database can handle concurrent reads diff --git a/docs/releases/0.2.0.md b/docs/releases/0.2.0.md deleted file mode 100644 index 88bf0acd..00000000 --- a/docs/releases/0.2.0.md +++ /dev/null @@ -1,8 +0,0 @@ -## [0.2.0](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.2.0...v0.1.2) (2025-03-18) - -- **CloudZero Aggregator**: The CloudZero Aggregator (affectionately known as "The Gator") is a new component that sits between the CloudZero Agent and the CloudZero Platform. The Gator aggregates metrics into a local cache before sending them in larger batches to the CloudZero Platform. This provides substantial improvements in reliability, performance, disaster recovery, user-friendliness, and more. 
- -### New Features - -- Collector application -- Shipper application diff --git a/docs/releases/0.2.1.md b/docs/releases/0.2.1.md deleted file mode 100644 index ddee7d3f..00000000 --- a/docs/releases/0.2.1.md +++ /dev/null @@ -1,20 +0,0 @@ -## [0.2.1](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.2.1...v0.2.0) (2025-03-28) - -This is primarily a bug fix release, but also contains a number of updates and -improvements. - -### Improvements - -- **Additional testing**: A substantial number of tests have been added. -- **More robust build process**: The application can now be built from a - directory named something other than "insights-controller". -- **Optional /debug/pprof/ endpoint**: If desired, you can enable profiling in - order to better understand the performance characteristics of the application. -- **Configuratble destination URL for insight-controller app**: The - `insight-controller` application now supports a configurable destination URL - for metrics. - -### Bug Fixes - -- **Updated update logic**: An issue in which records could be sent multiple - times is fixed. The logic for updating the `sent_at` time is simplified. diff --git a/docs/releases/1.1.0-beta-3.md b/docs/releases/1.1.0-beta-3.md deleted file mode 100644 index b06c45d2..00000000 --- a/docs/releases/1.1.0-beta-3.md +++ /dev/null @@ -1,7 +0,0 @@ -## [1.1.0-beta-3](https://github.com/Cloudzero/cloudzero-agent/compare/v0.2.1...v1.1.0-beta-3) (2025-04-08) - -This release centralizes all CloudZero Agent code into a single repository. - -For a complete list of changes, please refer to the [CloudZero Insights -Controller](https://github.com/Cloudzero/cloudzero-insights-controller/) and -[CloudZero Charts](https://github.com/Cloudzero/cloudzero-charts/) repositories. 
diff --git a/docs/releases/CHANGELOG-1.0.md b/docs/releases/CHANGELOG-1.0.md new file mode 100644 index 00000000..35741fe2 --- /dev/null +++ b/docs/releases/CHANGELOG-1.0.md @@ -0,0 +1,103 @@ +# CloudZero Agent 1.0.X - Changelog + +## Overview + +The 1.0.X release series introduced native Kubernetes **Labels** and **Annotations** support to the CloudZero platform, marking a major milestone in resource categorization and management capabilities. + +## Major Features + +### Kubernetes Labels and Annotations Support + +- **Native Integration**: Direct support for Kubernetes Labels and Annotations within the CloudZero platform +- **Enhanced Categorization**: Improved ability to categorize and manage resources based on Labels and Annotations +- **Dimension Identification**: Kubernetes dimensions can now be identified based on deployment Labels and Annotations + +### New Components + +- **Insights Controller**: New ValidatingAdmissionWebhook for recording created labels and annotations +- **Service Account Management**: New service account configuration for the Insights Controller +- **Certificate Management**: Integration with Jetstack.io "cert-manager" for TLS certificate handling + +## Configuration Changes + +### New Configuration Options + +- `cert-manager.enabled`: Deploy cert-manager (default: depends on environment) +- `serviceAccount.create`: Create service account (default: true) +- `insightsController.enabled`: Enable ValidatingAdmissionWebhook (default: true) +- `insightsController.labels.enabled`: Enable label collection (default: true) +- `insightsController.annotations.enabled`: Enable annotation collection (default: false) +- Label and annotation pattern filtering with regular expressions + +### API Key Management Changes + +- API key arguments moved to `global` section +- `apiKey` → `global.apiKey` +- `existingSecretName` → `global.existingSecretName` + +## Breaking Changes and Deprecations + +### Deprecated Components + +- **node-exporter**: Completely 
deprecated and no longer used +- **External kube-state-metrics**: Replaced with internal `cloudzero-state-metrics` instance + +### Configuration Breaking Changes + +- API key management arguments relocated to global section +- Some existing values no longer necessary in override configurations + +## Bug Fixes Across 1.0.X Series + +### 1.0.1 Fixes + +- Fixed webhook resource naming validation issues +- Resolved TLS certificate generation for webhook configuration changes +- Fixed invalid Prometheus metric label names causing panics +- Stopped using the default Kubernetes logger so that the configured logging level is respected + +### 1.0.2 Fixes + +- Template rendering improvements +- Enhanced certificate generation reliability + +## Improvements + +### Performance and Reliability + +- Shorter TTL for `init-cert` Job (5 seconds cleanup) +- Improved SQLite connection handling and testing +- Enhanced logging with appropriate debug/info level separation +- Improved validation and check results output + +### Documentation + +- Added comprehensive Istio cluster documentation +- Enhanced upgrade instructions and configuration examples +- Detailed security scan results and vulnerability reporting + +## Security + +### Vulnerability Status + +All images in 1.0.X series show zero critical, high, medium, low, or negligible vulnerabilities according to Grype security scans. 
+ +## Upgrade Path + +To upgrade to any 1.0.X version: + +```bash +helm repo add cloudzero https://cloudzero.github.io/cloudzero-charts +helm repo update +helm upgrade --install <RELEASE_NAME> cloudzero/cloudzero-agent -n <NAMESPACE> --create-namespace -f configuration.yaml --version 1.0.X +``` + +## Version History + +- **1.0.0** (2025-02-17): Initial major release with Labels/Annotations support +- **1.0.1** (2025-03-02): Bug fixes for template rendering and TLS certificates +- **1.0.2**: Additional stability improvements + +--- + +_This changelog covers the major features and changes introduced in the CloudZero Agent 1.0.X release series._ diff --git a/docs/releases/CHANGELOG-1.1.md b/docs/releases/CHANGELOG-1.1.md new file mode 100644 index 00000000..b8e7c0bd --- /dev/null +++ b/docs/releases/CHANGELOG-1.1.md @@ -0,0 +1,107 @@ +# CloudZero Agent 1.1.X - Changelog + +## Overview + +The 1.1.X release series introduced the **CloudZero Aggregator**, a high-performance, cost-efficient replacement for the CloudZero OLTP metrics API, along with significant enhancements to reliability, performance, and user experience. 
+ +## Major Features + +### CloudZero Aggregator + +- **High-Performance Collector**: Local aggregator application for enhanced telemetry processing +- **End-to-End Telemetry Tracking**: Complete visibility and traceability of telemetry files +- **Resilience During Key Rotation**: Seamless API key rotation with Multiple API Key feature +- **Direct S3 Upload**: Data uploads directly to dedicated customer S3 buckets +- **Improved Onboarding Feedback**: Faster deployment configuration feedback (typically 10 minutes) +- **Configurable Upload Intervals**: Flexible upload timing (default: 10 minutes) + +### Architecture Improvements + +- **Single Binary and Single Version**: Unified CloudZero image reference tagged to chart release +- **Simplified Image Management**: Streamlined image mirroring, versioning, and operational identification +- **Consolidated Webhooks**: Multiple Kubernetes validating webhooks merged into single webhook + +## Performance Enhancements + +### Monitoring and Metrics + +- **Improved Scrape Frequency**: Metrics captured every 1 minute (previously 2 minutes) +- **Greater Granularity**: Enhanced monitoring precision and faster actionable insights +- **Enhanced HTTP Error Logging**: Improved debugging and monitoring efficiency + +### Configuration and Deployment + +- **New Configuration API**: Minimalistic Helm values/overrides API for future compatibility +- **Automatic DNS Configuration**: Auto-generated DNS configuration and priority class settings +- **Enhanced Disk Management**: Configurable disk monitoring and improved space management +- **Pod Disruption Budgets**: Added for higher availability during maintenance + +## Reliability and Stability + +### Enhanced Functionality + +- **Improved Labels and Annotations**: Better performance for backfilling operations +- **Enhanced Debug Logging**: Debug logging for abandoned file IDs in shipper component +- **Reduced Complexity**: Consolidated webhooks reduce operational complexity + +### 
Compatibility Improvements (1.1.1) + +- **Extended Kubernetes Support**: Reduced requirement from 1.23 to 1.21 +- **Expanded Installation Compatibility**: Support for clusters back to mid-2022 EOL versions +- **Reduced Permissions**: Removed patch permission requirement on deployments for cert initialization + +## Bug Fixes Across 1.1.X Series + +### 1.1.0 Fixes + +- **Duplicate Affinity**: Resolved affinity settings duplication in insights deployment +- **Agent Termination**: Fixed hang issue during agent termination in redeployments +- **Validation Improvement**: Enhanced validation and check results output + +### 1.1.1 Fixes + +- **Shipper Directory Creation**: Recursive subdirectory creation prevents restart failures +- **Dependency Updates**: Various dependency updates for improved stability + +### 1.1.2 Fixes + +- Additional maintenance and stability improvements + +## Documentation and User Experience + +### Comprehensive Documentation + +- **API Scopes Guidance**: Detailed required API scopes for Kubernetes agent +- **Smoother Onboarding**: Enhanced configuration guidance +- **Operational Identification**: Clearer versioning and image identification + +### Configuration Flexibility + +- **Configurable Components**: Images, labels, annotations, tolerations, affinities +- **Node Selectors**: Flexible node selector configuration +- **Priority Classes**: Configurable priority class settings +- **DNS Settings**: Enhanced DNS configuration options + +## Breaking Changes + +- **OLTP API Replacement**: CloudZero Aggregator replaces OLTP metrics API +- **Webhook Consolidation**: Multiple webhooks consolidated into single webhook +- **Configuration Changes**: New configuration API may require override file updates + +## Upgrade Path + +To upgrade to any 1.1.X version: + +```bash +helm upgrade --install <RELEASE_NAME> cloudzero/cloudzero-agent -n <NAMESPACE> --create-namespace -f configuration.example.yaml --version 1.1.X +``` + +## Version History + +- **1.1.0** (2025-04-29): Major release 
with CloudZero Aggregator +- **1.1.1** (2025-05-02): Maintenance release expanding compatibility +- **1.1.2**: Additional stability and maintenance improvements + +--- + +_This changelog covers the major features and changes introduced in the CloudZero Agent 1.1.X release series._ diff --git a/docs/releases/CHANGELOG-1.2.md b/docs/releases/CHANGELOG-1.2.md new file mode 100644 index 00000000..4dfeb698 --- /dev/null +++ b/docs/releases/CHANGELOG-1.2.md @@ -0,0 +1,141 @@ +# CloudZero Agent 1.2.X - Changelog + +## Overview + +The 1.2.X release series introduced **Federated Mode** support, comprehensive **Helm Schema Validation**, enhanced **Configuration Management**, and numerous improvements to observability, reliability, and maintainability. + +## Major Features + +### Federated Mode Support + +- **Large Cluster Support**: Agent deployment optimized for large cluster environments +- **Node-Level Deployment**: Agent runs on each node instead of single agent for all nodes +- **Configuration**: Enable with `defaults.federation.enabled: true` + +### Enhanced Configuration Management + +- **Comprehensive Helm Schema Validation**: Extended JSON Schema validation covering entire configuration +- **Early Feedback**: Earlier detection and reporting of configuration issues +- **Configuration ConfigMap**: Complete Helm chart configuration stored in ConfigMap for easier debugging + +### New Tools and Utilities + +- **CloudZero Helmless Tool**: Shows minimal differences between default and actual configuration +- **Minimized Overrides**: Recreates minimized override files +- **Helmless Job**: Helm job for easy minimal configuration override determination + +### Configuration Automation + +- **Cloud Provider Detection**: Automatic CSP metadata detection through scout package integration +- **Configuration Simplification**: Eliminates manual cloudAccountId and region setup requirements +- **Multi-Cloud Support**: AWS and Google Cloud environment auto-detection across all 
applications +- **Zero-Configuration Deployment**: Enhanced collector, shipper, and webhook applications with automatic environment detection + +## Performance and Efficiency Improvements + +### Load Balancing and Connectivity + +- **Improved Load Balancing**: Enhanced HTTP connection handling +- **Periodic Connection Rotation**: Proper load distribution across service replicas +- **Multi-Replica Support**: Optimized for multi-replica deployments + +### Storage Optimization + +- **Reduced Storage Usage**: Metric files stored for 7 days (previously 90 days) +- **Significant Storage Reduction**: Dramatically reduced storage requirements +- **Cost Efficiency**: Lower operational costs through optimized retention + +### Configuration Automation + +- **Reduced Manual Setup**: Scout package eliminates need for manual cloudAccountId and region configuration +- **Faster Deployment**: Automatic environment detection speeds up installation process +- **Cluster Name Detection**: Google Cloud environments support automatic cluster name discovery + +## Observability and Debugging + +### Enhanced Logging + +- **Configurable Prometheus Log Levels**: Flexible logging configuration +- **Reduced Log Noise**: Health checks moved to trace level +- **Positive Confirmation Logging**: Regular info-level messages confirm proper operation +- **Improved Debugging**: Better visibility into agent operations + +### Monitoring Improvements + +- **Enhanced Error Handling**: Better error reporting and debugging capabilities +- **Operational Visibility**: Improved insight into agent performance and health + +## Reliability and Bug Fixes + +### Major Bug Fixes Across 1.2.X Series + +#### 1.2.0 Fixes + +- **Eliminate Unnecessary Replays**: Fixed shipper file replay issues +- **Out-of-Order Metrics**: Added configuration window for out-of-order metric acceptance (default: 5 minutes) + +#### 1.2.1 Fixes + +- **Subchart Schema Validation**: Fixed JSON Schema validation for subchart usage +- **Global 
Property Support**: Resolved Helm subchart global property validation errors + +#### 1.2.2 Fixes + +- **Configuration Management**: Fixed component-specific configuration merging issues +- **ConfigMap References**: Updated ConfigMap name references to correct naming convention +- **Resource Lookup Failures**: Prevented resource lookup failures +- **Template Generation**: Fixed invalid Kubernetes resource template generation +- **Label Filtering**: Aggregator no longer filters out "resource_type" and "workload" labels + +## Security and Availability + +### Enhanced Security + +- **Default Pod Disruption Budgets**: Improved availability during disruptions +- **Schema Validation**: Comprehensive validation prevents configuration errors +- **Resource Protection**: Better resource naming and reference handling + +### Testing and Quality Assurance + +- **Subchart Testing**: Comprehensive test coverage for subchart scenarios +- **Regression Prevention**: Tests prevent schema validation regression +- **Resource Creation Verification**: Checks ensure successful Kubernetes resource creation +- **Template Validation**: Kubeconform tests validate generated templates + +## Development and Maintenance + +### Code Quality Improvements + +- **Helmless Tool Enhancement**: Split implementation with enhanced testing coverage +- **Unnecessary Functionality Removal**: Cleaned up unused code paths +- **Testing Infrastructure**: Improved validation and testing frameworks + +### Dependency Management + +- **Updated Dependencies**: Regular dependency updates for security and stability +- **Component Isolation**: Better separation of concerns in tool implementations + +## Breaking Changes + +- **Storage Retention**: Default metric file retention reduced from 90 to 7 days +- **Schema Validation**: Stricter validation may catch previously ignored configuration errors +- **ConfigMap Naming**: Updated naming conventions may affect existing references + +## Upgrade Path + +To upgrade to any 
1.2.X version: + +```bash +helm upgrade --install <RELEASE_NAME> cloudzero/cloudzero-agent -n <NAMESPACE> --create-namespace -f configuration.example.yaml --version 1.2.X +``` + +## Version History + +- **1.2.0** (2025-06-05): Major release with Federated Mode and Schema Validation +- **1.2.1** (2025-06-17): Bugfix release for subchart schema validation +- **1.2.2** (2025-06-24): Maintenance release with configuration and template fixes +- **1.2.3** (2025-06-27): Major release with Configuration Automation and multi-cloud auto-detection + +--- + +_This changelog covers the major features and changes introduced in the CloudZero Agent 1.2.X release series._ diff --git a/docs/releases/RELEASE_PROCESS.md b/docs/releases/RELEASE_PROCESS.md index 4f5eff5d..eeaff1bb 100644 --- a/docs/releases/RELEASE_PROCESS.md +++ b/docs/releases/RELEASE_PROCESS.md @@ -1,268 +1,397 @@ -# Release Process +# CloudZero Agent Release Process -This guide outlines the steps and best practices for managing releases in the repository. Following this process ensures consistency, quality, and clear communication with all stakeholders, including necessary external approvals. +This guide outlines the steps and best practices for managing releases of the CloudZero Agent. The process uses centralized changelog files and automated workflows to ensure consistency, quality, and proper chart mirroring to the cloudzero-charts repository. ## Overview -1. **Create Release Document** -2. **Submit Pull Request (PR)** -3. **Review and Merge** -4. **Trigger Manual Release Workflow** -5. **Obtain External Approvals** -6. **Publish Release**, **IMPORANT as the release is published only as a `pre-release` due to actions workflow limitations\_** +The CloudZero Agent release process has been streamlined to use centralized changelog files instead of individual release notes: + +1. **Generate Changelog** +2. **Review Changelog** +3. **Trigger Manual Release Workflow** +4. **Automated Chart Mirroring** +5. 
**Publish Release** --- ## Step-by-Step Process -### 1. Create a New Release Document +### 1. Generate Changelog + +Use the automated changelog generation tool to create or update the changelog file: + +```bash +# Generate changelog for version 1.2.3 (will update CHANGELOG-1.2.md) +TAG_VERSION=1.2.3 make generate-changelog +``` + +**What this does:** + +- Analyzes git commits since the last release +- Extracts user-facing changes, bug fixes, and new features +- Updates or creates `docs/releases/CHANGELOG-X.Y.md` file +- Follows the established changelog format used across the project + +**Changelog Location:** + +- **Directory:** `docs/releases/` +- **Filename Format:** `CHANGELOG-X.Y.md` (e.g., `CHANGELOG-1.2.md`) +- **Content:** Version-specific sections within the changelog file + +### 2. Review and Commit Changelog + +Review the generated changelog for accuracy and completeness: + +```bash +# Review the generated changelog +git diff docs/releases/CHANGELOG-*.md + +# Make any necessary edits to improve clarity or add context +# Commit the changelog +git add docs/releases/CHANGELOG-*.md +git commit -m "Update changelog for version 1.2.3" +git push origin develop +``` -- **Location:** [docs/releases](.) -- **Filename Format:** `X.X.X.md` (e.g., `1.2.3.md`) -- **Template:** Use the provided [Release Notes Template](#release-notes-template) below. +**Review Guidelines:** -**Instructions:** +- Ensure user-facing language is clear and non-technical +- Verify all major features and breaking changes are captured +- Check that version sections are properly formatted +- Validate that grouped changes are logically organized -- Duplicate the release notes template. -- Replace placeholders with the appropriate version number and details. -- Save the file with the correct naming convention in the `docs/releases` folder. +### 3. Trigger Manual Release Workflow -### 2. 
Open a Pull Request (PR) +Navigate to GitHub Actions and trigger the release workflow: -- **Target Branch:** `develop` -- **PR Title:** `Release X.X.X - [Brief Description]` +- **Workflow:** "Manual Prepare Release" +- **Location:** `Actions > Manual Prepare Release` +- **Input Required:** Version number (e.g., `1.2.3`) -**Automations:** +**What the workflow does:** -- Opening a PR will automatically notify the **Product Management (PM)** and **Documentation** teams for review. +1. **Validates:** Confirms the required changelog file exists (`docs/releases/CHANGELOG-1.2.md`) +2. **Updates:** Helm chart version references and regenerates templates +3. **Merges:** `develop` branch into `main` with fast-forward merge +4. **Tags:** Creates git tag `v1.2.3` +5. **Extracts:** Release notes from the changelog file automatically +6. **Creates:** GitHub release (as draft) with extracted content -### 3. Review and Merge the PR +### 4. Automated Chart Mirroring -- **Reviewers:** PM Team, Documentation Team -- **Approval:** Obtain necessary approvals from the reviewers. -- **Merge:** Once approved, merge the release notes PR into the `develop` branch. +The chart mirroring process runs automatically on every push to `develop`: -### 4. Trigger the Manual Release Workflow +**Mirror Workflow (`mirror-chart.yml`):** -- **Workflow Link:** [Manual Release Workflow](https://github.com/Cloudzero/cloudzero-insights-controller/actions/workflows/release-to-main.yml) +- **Triggers:** Automatically on push to `develop` branch +- **Syncs:** `helm/` directory to `cloudzero-charts/charts/cloudzero-agent/` +- **Includes:** Changelog files are now synced to charts repository +- **Preserves:** Commit history and authorship information -**Purpose:** +**Chart Repository Structure:** -- Ensures stakeholders review the functionality and public-facing documentation before publishing. -- **External Approvals Required:** Designated stakeholders must manually approve the release. 
+``` +cloudzero-charts/ +└── charts/ + └── cloudzero-agent/ + ├── templates/ # Helm templates + ├── values.yaml # Chart values + ├── Chart.yaml # Chart metadata + └── docs/ + └── releases/ # Centralized changelog files (mirrored from docs/releases/) + ├── CHANGELOG-1.0.md + ├── CHANGELOG-1.1.md + ├── CHANGELOG-1.2.md + └── RELEASE_PROCESS.md +``` -**Steps:** +**Key Changes**: -1. Navigate to the **Actions** tab in your repository. -2. Select the **Manual Release** workflow. -3. Trigger the workflow and follow any on-screen instructions. -4. **Await Stakeholder Approval:** Stakeholders will receive a notification to review and approve the release. +- **Single Source**: `docs/releases/` is the authoritative location for all release documentation +- **No Duplication**: `helm/docs/releases/` legacy files are excluded from mirroring +- **Complete Sync**: Entire `docs/releases/` directory (including RELEASE_PROCESS.md) is mirrored to charts repo -### 5. Obtain External Approvals +### 5. Publish Release -- **Stakeholders Involved:** Product Management, Documentation, and any other designated external parties. -- **Approval Process:** - - Stakeholders review the functionality, documentation, and overall release readiness. - - Upon satisfaction, stakeholders provide manual approval through the workflow interface. +After the release workflow completes: -**Note:** The release cannot proceed to publication without obtaining these external approvals. +1. **Review Draft Release:** Navigate to GitHub Releases and review the draft +2. **Verify Content:** Ensure release notes are properly extracted from changelog +3. **Publish Release:** Remove draft status to publish the release -### 6. Publish the Release +**Post-Publication:** -- Once the manual release workflow is approved and completed, the release will be published only as a `pre-release`. -- _**You must go to the release and remove the `pre-release` flag. 
This will trigger the container image publish.**_ -- Stakeholders will be notified upon successful publication. +- Container images are automatically built and published +- Charts repository is updated with latest changes +- Release notifications are sent to watchers --- -## Creating Release Notes +## Changelog Format and Guidelines -Effective release notes provide clear and concise information about the changes in each release. Follow these guidelines to create comprehensive release notes. +The CloudZero Agent uses centralized changelog files following a consistent format. The automated changelog generation tool analyzes git commits and creates well-structured changelogs. -### Pro-Tip +### Changelog Structure -- **Review Changes:** Examine the GitHub diff between the previous and current release to identify all changes. -- **Commit Messages:** Use commit titles to outline the changes and categorize them appropriately. Include a link to the relevant commit where possible. +Changelog files follow this standardized format: -### Release Notes Structure +````markdown +# CloudZero Agent X.Y.Z - Changelog -Use the following sections to organize your release notes: +## Overview -1. **Upgrade Steps** -2. **Breaking Changes** -3. **New Features** -4. **Bug Fixes** -5. **Improvements** -6. **Other Changes** +Brief summary of the release series and major themes. -#### Upgrade Steps +## Major Features -- **Purpose:** Detail any actions users must take to upgrade beyond updating dependencies. -- **Content:** - - Step-by-step instructions for the upgrade. - - Pseudocode or code snippets highlighting necessary changes. - - Recommendations to upgrade due to known issues in older versions. -- **Note:** Ideally, no upgrade steps are required. 
+### Feature Name -#### Breaking Changes +- **Key Capability**: Description of what it does for users +- **Benefits**: How it helps users +- **Configuration**: Any relevant configuration details -- **Purpose:** List all breaking changes that may affect users. -- **Content:** - - Comprehensive list of changes that are not backward compatible. - - Typically included in major version releases. -- **Note:** Aim to minimize breaking changes. +## Performance Enhancements -#### New Features +- **Improvement Description**: Include metrics when available +- **Enhanced Functionality**: Details about optimizations -- **Purpose:** Describe new functionalities introduced in the release. -- **Content:** - - Detailed descriptions of each new feature. - - Usage scenarios and benefits. - - Include screenshots or diagrams where applicable. - - Mention any caveats, warnings, or if the feature is in beta. +## Bug Fixes Across X.Y.Z Series -#### Bug Fixes +### X.Y.0 Fixes -- **Purpose:** Highlight fixes for existing issues. -- **Content:** - - Description of the issues that have been resolved. - - Reference to related features or functionalities. +- **Issue Description**: Clear explanation of what was fixed +- **Resolution**: How the issue was resolved -#### Improvements +## Breaking Changes -- **Purpose:** Outline enhancements made to existing features or workflows. -- **Content:** - - Performance optimizations. - - Improved logging or error messaging. - - Enhancements to user experience. +- **Change Description**: Impact and migration requirements -#### Other Changes +## Upgrade Path -- **Purpose:** Capture miscellaneous changes that do not fit into the above categories. -- **Content:** - - Minor updates or maintenance tasks. - - Documentation updates. -- **Note:** Aim to keep this section empty by categorizing changes appropriately. 
+```bash +helm upgrade --install <RELEASE_NAME> cloudzero/cloudzero-agent -f configuration.yaml --version X.Y.Z +``` ---- +## Version History -### Release Notes Template +- **X.Y.0** (YYYY-MM-DD): Initial release description +- **X.Y.1** (YYYY-MM-DD): Maintenance release description -Copy and paste the following template to create your release notes. Replace placeholders with relevant information. +```` -```markdown -## [X.X.X](https://github.com/Cloudzero/cloudzero-insights-controller/compare/vX.X.X-1...vX.X.X) (YYYY-MM-DD) +### Automated Changelog Generation -> Brief description of the release. +The `make generate-changelog` command: -### Upgrade Steps +1. **Analyzes Commits**: Reviews git history since last release tag +2. **Extracts Changes**: Identifies user-facing changes, features, and fixes +3. **Categorizes Content**: Groups changes by type (features, fixes, improvements) +4. **Formats Output**: Follows existing changelog structure and style +5. **User-Focused Language**: Emphasizes benefits and impact to users -- [ACTION REQUIRED] -- Detailed upgrade instructions or steps. +### Manual Changelog Enhancement -### Breaking Changes +After generation, developers should: -- Description of breaking change 1. -- Description of breaking change 2. +1. **Review Accuracy**: Ensure all major changes are captured +2. **Improve Clarity**: Rewrite technical language for user-friendly descriptions +3. **Add Context**: Include configuration examples and upgrade guidance +4. **Verify Formatting**: Check markdown structure and section organization -### New Features +--- -- **Feature Name:** Detailed description, usage scenarios, and any relevant notes or images. -- **Feature Name:** Detailed description, usage scenarios, and any relevant notes or images. +## Workflow Details -### Bug Fixes +### Release Workflow (`release-to-main.yml`) -- **Bug Fix Description:** Explanation of the issue and how it was resolved.
-- **Bug Fix Description:** Explanation of the issue and how it was resolved. +The manual release workflow performs these automated steps: -### Improvements +1. **Validation Phase**: + ```bash + # Validates that the required changelog exists + MINOR_VERSION=$(echo "1.2.3" | cut -d. -f1,2) + test -f "docs/releases/CHANGELOG-${MINOR_VERSION}.md" +```` -- **Improvement Description:** Details about the enhancement. -- **Improvement Description:** Details about the enhancement. +2. **Version Update Phase**: -### Other Changes + - Updates Helm chart image versions + - Regenerates chart templates and tests + - Commits version changes to `develop` -- **Change Description:** Brief explanation of the change. -- **Change Description:** Brief explanation of the change. -``` +3. **Release Phase**: -**Example:** + - Fast-forward merges `develop` into `main` + - Creates git tag (e.g., `v1.2.3`) + - Extracts release notes from changelog file + - Creates GitHub draft release -```markdown -## [0.29.0](https://github.com/Cloudzero/cloudzero-insights-controller/compare/v0.27.0...v0.29.0) (2024-11-07) +4. **Release Notes Extraction**: + ```bash + # Automatically extracts version-specific content + awk '/^## .*1.2.3/ { found=1; next } /^## / && found { exit } found { print }' CHANGELOG-1.2.md + ``` -> This release includes performance improvements and minor changes to documentation. +### Chart Mirroring Workflow (`mirror-chart.yml`) -### Performance Improvements +Automatically syncs changes to the charts repository: -- **dependencies:** Bump dependencies to latest versions for improved security and performance. [4a4ee13](https://github.com/Cloudzero/cloudzero-insights-controller/commit/4a4ee13) +1. **Trigger**: Every push to `develop` branch +2. **Sync Operations**: + - Mirrors `helm/` directory to `cloudzero-charts/charts/cloudzero-agent/` + - Copies changelog files to chart's `docs/releases/` directory + - Preserves commit authorship and history +3. 
**Result**: Charts repository stays synchronized with latest changes -### Other Changes +### Automation Benefits -- **chore(conventionalChangelog):** Add Conventional Changelog for automated release notes generation. [aafcdd9](https://github.com/Cloudzero/cloudzero-insights-controller/commit/aafcdd9) -- **docs(CHANGELOG):** Add changelog documentation. [e2c7435](https://github.com/Cloudzero/cloudzero-insights-controller/commit/e2c7435) -``` +- **Consistency**: Standardized changelog format across all releases +- **Efficiency**: Automated extraction eliminates manual copy-paste errors +- **Traceability**: Single source of truth for release information +- **Integration**: Seamless sync between repositories --- ## Best Practices -- **Consistency:** Maintain a consistent format and structure across all release notes. -- **Clarity:** Use clear and concise language to describe changes. -- **Categorization:** Properly categorize changes to make it easier for users to find relevant information. -- **Visual Aids:** Include screenshots or diagrams for new features to enhance understanding. -- **Review:** Ensure all sections are thoroughly reviewed by relevant teams before publishing. -- **Highlight External Approvals:** Clearly indicate when external approvals are required and obtained. 
+### Changelog Management + +- **Single Source of Truth**: Use centralized changelog files instead of individual release notes +- **Automated Generation**: Leverage `make generate-changelog` for consistency +- **User-Focused Language**: Write for users, not developers - emphasize benefits and impact +- **Version Organization**: Group related changes by minor version series (1.2.x) +- **Regular Updates**: Update changelogs incrementally rather than at release time + +### Release Coordination + +- **Early Preparation**: Generate changelog files well before release deadlines +- **Stakeholder Review**: Allow time for team review of generated changelogs +- **Testing Integration**: Ensure release process includes full testing suite +- **Documentation Sync**: Verify charts repository receives updated documentation + +### Quality Assurance + +- **Validation**: Use automated workflow validation to catch issues early +- **Content Review**: Manually review automated changelog generation for accuracy +- **Format Consistency**: Follow established changelog structure and formatting +- **Link Verification**: Ensure all references and links are functional --- -## Roles and Responsibilities +## Troubleshooting + +### Common Issues + +**Changelog File Missing**: + +```bash +# Error: test -f "docs/releases/CHANGELOG-1.2.md" fails +# Solution: Generate the changelog first +TAG_VERSION=1.2.3 make generate-changelog +``` + +**Release Notes Extraction Issues**: + +```bash +# Problem: Release notes are empty or come from the wrong section +# Cause: No "## " header containing the version was found, so the workflow fell back to the first "## " section +# Solution: Ensure changelog has proper version headers (## 1.2.3) +``` + +**Chart Mirroring Delays**: + +- Check that `develop` branch is up to date +- Verify mirror workflow completed successfully +- Confirm cloudzero-charts repository permissions + +**Workflow Permissions**: -- **Developer:** Drafts the release notes using the provided template.
-- **Product Management (PM) Team:** Reviews and approves the release notes for accuracy and completeness. -- **Documentation Team:** Ensures that the release notes are well-documented and user-friendly. -- **Stakeholders:** Review and provide external approval for the release through the Manual Release workflow. -- **Release Manager:** Oversees the release process, ensuring all steps, including external approvals, are completed. +- Ensure `VERSION_BUMP_DEPLOY_KEY` secret is configured +- Verify `CLOUDZERO_CHARTS_DEPLOY_KEY` secret exists +- Check repository permissions for workflow execution + +### Recovery Procedures + +**Failed Release Workflow**: + +1. Review workflow logs for specific error +2. Fix underlying issue (changelog, permissions, etc.) +3. Re-run workflow with same version number +4. Verify all steps complete successfully + +**Missing Chart Updates**: + +1. Manually trigger mirror workflow if needed +2. Verify chart repository has latest changes +3. Confirm changelog files are properly synced --- -## External Approvals +## Migration from Legacy Process + +### Key Changes from Previous Process -External approvals are a critical part of the release process to ensure that all stakeholders are aligned and that the release meets quality and functionality standards. +**Before (Legacy)**: -### Approval Workflow +- Individual `helm/docs/releases/{version}.md` files +- Manual creation of release notes +- Separate chart and agent documentation -1. **Initiate Approval:** +**After (Current)**: - - After triggering the Manual Release workflow, stakeholders receive a notification to review the release. +- Centralized `docs/releases/CHANGELOG-{minor}.md` files +- Automated changelog generation with manual review +- Synchronized documentation across repositories -2. **Review Process:** +### Migration Steps for Existing Releases - - Stakeholders evaluate the functionality, documentation, and overall readiness of the release. 
- - Any feedback or required changes are communicated back to the release manager or developer. +1. **Consolidate Existing Notes**: Move individual release files into appropriate changelog files +2. **Update Workflows**: Ensure workflows reference changelog files instead of individual notes +3. **Clean Up Legacy Files**: Remove `helm/docs/releases/` directory to eliminate duplication +4. **Team Training**: Educate team on new `make generate-changelog` process +5. **Documentation Update**: Update all references to point to new process -3. **Provide Approval:** +### Cleanup Strategy - - Once satisfied, stakeholders provide manual approval within the workflow interface. - - The release process proceeds to publication upon receiving all necessary approvals. +**Remove Legacy Release Files**: -4. **Handling Rejections:** - - If a stakeholder rejects the release, the release manager must address the feedback and possibly iterate on the release notes or code before resubmitting for approval. +```bash +# The helm/docs/releases/ directory can be safely removed +# All content has been consolidated into docs/releases/CHANGELOG-*.md files +rm -rf helm/docs/releases/ -**Note:** The release cannot be published until all required external approvals are obtained. 
+# Update any documentation that references the old location +# All chart documentation now sources from docs/releases/ +``` + +**Benefits of Consolidation**: + +- **Single Source of Truth**: Only `docs/releases/` contains release documentation +- **Reduced Maintenance**: No need to maintain duplicate files +- **Automatic Sync**: Charts repository gets complete release documentation +- **Simplified Workflow**: One location for all release-related files --- -## FAQs +## Integration with External Systems + +### GitHub Releases + +- Release content automatically extracted from changelog files +- Draft releases created for review before publication +- Tag creation and branch management fully automated + +### CloudZero Charts Repository -**Q: What if there are no changes for a minor release?** -A: Even if there are no significant changes, create a release document indicating that it is a maintenance or patch release. +- Automatic mirroring of helm chart and changelog files +- Preservation of commit history and authorship +- Seamless integration without manual intervention -**Q: How often should releases be made?** -A: Follow the project's release cadence, whether it's weekly, bi-weekly, monthly, etc., to ensure regular updates. +### CI/CD Pipeline -**Q: Who approves the final release?** -A: Designated stakeholders must manually approve the release through the Manual Release workflow. +- Integration with existing build and test processes +- Validation of changelog files before release +- Automated image building and publishing upon release -**Q: What happens if external approvals are not obtained in a timely manner?** -A: Communicate with stakeholders to address any blockers and ensure that the release schedule accommodates the approval process. +This updated process provides a streamlined, automated approach to releases while maintaining quality and consistency across the CloudZero Agent ecosystem.