Add docs

- Add Readme to project with basic install and usage - Add API documentation - Add Helm chart documentation Signed-off-by: adrianc <[email protected]>
Mellanox · Sep 4, 2024 · ab32138 · ab32138
1 parent 39f04ef
commit ab32138
Show file tree

Hide file tree

Showing 14 changed files with 930 additions and 11 deletions.
diff --git a/Makefile b/Makefile
@@ -221,6 +221,18 @@ golangci-lint:
 	curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell dirname $(GOLANGCI_LINT)) $(GOLANGCI_LINT_VERSION) ;\
 	}
 
+# gen-crd-api-reference-docs is the tool generate-api-docs uses to render the API reference HTML.
+# The version is pinned so documentation builds are reproducible; override it on the command line,
+# e.g. `make gen-crd-api-reference-docs GEN_CRD_API_REFERENCE_DOCS_VERSION=<tag-or-commit>`.
+# NOTE(review): v0.3.0 is assumed to be the release `@latest` resolved to - confirm before merging.
+GEN_CRD_API_REFERENCE_DOCS = $(LOCALBIN)/gen-crd-api-reference-docs
+GEN_CRD_API_REFERENCE_DOCS_VERSION ?= v0.3.0
+.PHONY: gen-crd-api-reference-docs ## Download gen-crd-api-reference-docs locally if necessary
+gen-crd-api-reference-docs: $(GEN_CRD_API_REFERENCE_DOCS)
+# $(LOCALBIN) is an order-only prerequisite: it must exist, but its timestamp never forces a re-install.
+$(GEN_CRD_API_REFERENCE_DOCS): | $(LOCALBIN)
+	@ GOBIN=$(LOCALBIN) go install github.com/ahmetb/gen-crd-api-reference-docs@$(GEN_CRD_API_REFERENCE_DOCS_VERSION)
+
+# helm-docs is the tool generate-helm-docs runs inside the chart directory.
+# HELM_DOCS_VERSION selects the release to install and can be overridden from the command line.
+HELM_DOCS_VERSION ?= v1.14.2
+HELM_DOCS = ${LOCALBIN}/helm-docs
+.PHONY: helm-docs ## Download helm-docs locally if necessary
+helm-docs: ${HELM_DOCS}
+# ${LOCALBIN} is order-only: the directory must exist but never triggers a re-install by itself.
+${HELM_DOCS}: | ${LOCALBIN}
+	@ GOBIN=${LOCALBIN} go install github.com/norwoodj/helm-docs/cmd/helm-docs@${HELM_DOCS_VERSION}
 ##@ General
 
 # The help target prints out all targets with their descriptions organized
@@ -281,6 +293,20 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
 generate-mocks: mockery ## generate mock objects
 	PATH=$(LOCALBIN):$(PATH) go generate ./...
 
+
+# generate-api-docs renders the ./api/v1alpha1 API reference to HTML with gen-crd-api-reference-docs,
+# converts it to markdown with the pandoc/minimal container image, stores the result in
+# docs/api-reference.md and removes the intermediate HTML file.
+# The build and docs directories are mounted into the container separately so the path pandoc reads
+# always follows $(BUILDDIR) instead of assuming it is <repo>/build.
+# chmod a+w is kept from the original recipe, presumably because the container may create the
+# markdown file with a different owner - TODO confirm.
+.PHONY: generate-api-docs
+generate-api-docs: gen-crd-api-reference-docs ## generate api documentation
+	mkdir -p $(BUILDDIR)
+	$(GEN_CRD_API_REFERENCE_DOCS) -api-dir=./api/v1alpha1 -config=${CURDIR}/hack/api-docs/config.json \
+	-template-dir=${CURDIR}/hack/api-docs/templates -out-file=$(BUILDDIR)/api-reference.html
+	$(CONTAINER_TOOL) run --rm --volume "$(abspath $(BUILDDIR)):/in:Z" --volume "$(CURDIR)/docs:/out:Z" \
+	pandoc/minimal -f html -t markdown_strict --columns 200 /in/api-reference.html -o /out/api-reference.md
+	chmod a+w docs/api-reference.md
+	rm -f $(BUILDDIR)/api-reference.html
+
+# generate-helm-docs runs helm-docs from inside the maintenance-operator chart directory.
+# The cd and the tool invocation share one recipe line (joined with &&) so the tool runs in the
+# chart directory and is skipped if the cd fails.
+.PHONY: generate-helm-docs
+generate-helm-docs: helm-docs ## generate helm documentation
+	cd deployment/maintenance-operator-chart && $(HELM_DOCS)
+
 ##@ Build
 
 .PHONY: build

diff --git a/README.md b/README.md
@@ -5,7 +5,177 @@
 [![CodeQL](https://github.com/Mellanox/maintenance-operator/actions/workflows/codeql.yml/badge.svg)](https://github.com/Mellanox/maintenance-operator/actions/workflows/codeql.yml)
 [![Image push](https://github.com/Mellanox/maintenance-operator/actions/workflows/image-push-main.yml/badge.svg?event=push)](https://github.com/Mellanox/maintenance-operator/actions/workflows/image-push-main.yml)
 
-# Nvidia Maintenance Operator
-coordinates node maintenance operations in K8s cluster
+# NVIDIA Maintenance Operator
 
-> __NOTE__: This project is currently under active development.
+NVIDIA Maintenance Operator provides a Kubernetes API (Custom Resource Definition) that allows node maintenance operations in a K8s cluster
+to be performed in a coordinated manner. It performs some common operations to prepare a node for maintenance, such as cordoning
+the node as well as draining it.
+
+Users/Consumers can request to perform maintenance on a node by creating a NodeMaintenance Custom Resource (CR).
+The operator will then reconcile NodeMaintenance CRs. At a high level, this is the reconcile flow:
+
+1. Scheduling - schedule NodeMaintenance to be processed by the operator, taking into account constraints
+  such as the maximal allowed parallel operations.
+2. Node preparation for maintenance such as cordon and draining of the node
+3. Mark NodeMaintenance as Ready (via condition)
+4. Cleanup on deletion of NodeMaintenance such as node uncordon
+
+## Deployment
+
+### Prerequisites
+
+* Kubernetes cluster
+
+### Helm
+
+#### Deploy latest from project sources
+
+```bash
+# Clone project
+git clone https://github.com/Mellanox/maintenance-operator.git ; cd maintenance-operator
+
+# Install Operator
+helm install -n maintenance-operator --create-namespace --set operator.image.tag=latest maintenance-operator ./deployment/maintenance-operator-chart
+
+# View deployed resources
+kubectl -n maintenance-operator get all
+```
+
+> [!NOTE]
+> Refer to [helm values documentation](deployment/maintenance-operator-chart/README.md) for more information
+
+#### Deploy last release from OCI repo
+
+```bash
+helm install -n maintenance-operator --create-namespace maintenance-operator oci://ghcr.io/mellanox/maintenance-operator-chart
+```
+
+### Kustomize (for development)
+
+```bash
+# clone project
+git clone https://github.com/Mellanox/maintenance-operator.git ; cd maintenance-operator
+
+# build image
+IMG=harbor.mellanox.com/cloud-orchestration-dev/adrianc/maintenance-operator:latest make docker-build
+
+# push image
+IMG=harbor.mellanox.com/cloud-orchestration-dev/adrianc/maintenance-operator:latest make docker-push
+
+# deploy
+IMG=harbor.mellanox.com/cloud-orchestration-dev/adrianc/maintenance-operator:latest make deploy
+
+# undeploy
+make undeploy
+```
+
+## CRDs
+
+### MaintenanceOperatorConfig
+
+The MaintenanceOperatorConfig CRD is used for operator runtime configuration
+
+for more information refer to [api-reference](docs/api-reference.md)
+
+#### Example MaintenanceOperatorConfig
+
+```yaml
+apiVersion: maintenance.nvidia.com/v1alpha1
+kind: MaintenanceOperatorConfig
+metadata:
+  name: default
+  namespace: maintenance-operator
+spec:
+  logLevel: info
+  maxParallelOperations: 4
+```
+
+In this example we configure the following for the operator:
+
+* Log level (`logLevel`) is set to `info`
+* The max number of parallel maintenance operations (`maxParallelOperations`) is set to `4`
+
+### NodeMaintenance
+
+The NodeMaintenance CRD is used to request to perform a maintenance operation on a specific K8s node.
+In addition, it specifies which common (K8s related) operations need to happen in order to prepare a node for maintenance.
+
+Once the node is ready for maintenance the operator will set the `Ready` condition in the `status` field to `True`.
+After the maintenance operation is completed by the requestor, the NodeMaintenance CR should be deleted to finish the maintenance operation.
+
+for more information refer to [api-reference](docs/api-reference.md)
+
+#### Example NodeMaintenance
+
+```yaml
+apiVersion: maintenance.nvidia.com/v1alpha1
+kind: NodeMaintenance
+metadata:
+  name: my-maintenance-operation
+  namespace: default
+spec:
+  requestorID: some.one.acme.com
+  nodeName: worker-01
+  cordon: true
+  waitForPodCompletion:
+    podSelector: "app=important"
+    timeoutSeconds: 0
+  drainSpec:
+    force: true
+    podSelector: ""
+    timeoutSeconds: 0
+    deleteEmptyDir: true
+    podEvictionFilters:
+    - byResourceNameRegex: nvidia.com/gpu-*
+    - byResourceNameRegex: nvidia.com/rdma*
+
+```
+
+In this example we request to perform maintenance for node `worker-01`.
+
+the following steps will occur before the node is marked as ready for maintenance:
+
+1. cordon of `worker-01` node
+2. waiting for pods with `app: important` label to finish
+3. draining of `worker-01` with the provided `drainSpec`
+    1. force draining of pods even if they don't belong to a controller
+    2. allow draining of pods with emptyDir mount
+    3. only drain pods that consume either `nvidia.com/gpu-*`, `nvidia.com/rdma*` resources
+
+once the node is ready for maintenance `Ready` condition will be `True`
+
+```bash
+$ kubectl get nodemaintenances.maintenance.nvidia.com -A
+NAME                       NODE        REQUESTOR           READY   PHASE   FAILED
+my-maintenance-operation   worker-01   some.one.acme.com   True    Ready   
+```
+
+## NodeMaintenance State Diagram
+
+```mermaid
+
+stateDiagram-v2
+  pending: maintenance request registered, waiting to be scheduled
+  scheduled: maintenance request scheduled
+  cordon: cordon node
+  waitForPodCompletion: wait for specified pods to complete
+  draining: node draining
+  ready: node ready for maintenance
+  requestorFailed: requestor failed the maintenance operations
+
+  [*] --> pending : NodeMaintenance created
+  pending --> scheduled : scheduler selected NodeMaintenance for maintenance, add finalizer
+  scheduled --> cordon : preparation for cordon completed
+  cordon --> waitForPodCompletion : cordon completed
+  waitForPodCompletion --> draining : finished waiting for pods
+  draining --> ready : drain operation completed successfully, node is ready for maintenance, Ready condition is set to True
+  ready --> requestorFailed : requestor has set RequestorFailed condition
+
+  pending --> [*] : object deleted
+  scheduled --> [*] : object deleted
+  cordon --> [*] : object marked for deletion, cleanup before deletion
+  waitForPodCompletion --> [*] : object marked for deletion, cleanup before deletion
+  draining --> [*] : object marked for deletion, cleanup before deletion
+  ready --> [*] : object marked for deletion, cleanup before deletion
+  requestorFailed --> [*] : RequestorFailed condition cleared by requestor or external user, object marked for deletion, cleanup before deletion
+```
diff --git a/api/v1alpha1/doc.go b/api/v1alpha1/doc.go
@@ -0,0 +1,20 @@
+/*
+  2024 NVIDIA CORPORATION & AFFILIATES
+
+  Licensed under the Apache License, Version 2.0 (the License);
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an AS IS BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+// Package v1alpha1 contains API Schema definitions for the maintenance.nvidia.com v1alpha1 API group
+// +kubebuilder:object:generate=true
+// +groupName=maintenance.nvidia.com
+package v1alpha1
diff --git a/api/v1alpha1/nodemaintenance_types.go b/api/v1alpha1/nodemaintenance_types.go
@@ -122,7 +122,7 @@ type WaitForPodCompletionSpec struct {
 
 // DrainSpec describes configuration for node drain during automatic upgrade
 type DrainSpec struct {
-	// Force indicates if force draining is allowed
+	// Force draining even if there are pods that do not declare a controller
 	// +kubebuilder:validation:Optional
 	// +kubebuilder:default:=false
 	Force bool `json:"force,omitempty"`

diff --git a/config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml b/config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml
@@ -81,7 +81,8 @@ spec:
                     type: boolean
                   force:
                     default: false
-                    description: Force indicates if force draining is allowed
+                    description: Force draining even if there are pods that do not
+                      declare a controller
                     type: boolean
                   podEvictionFilters:
                     description: |-

diff --git a/deployment/maintenance-operator-chart/Chart.yaml b/deployment/maintenance-operator-chart/Chart.yaml
@@ -3,4 +3,4 @@ name: maintenance-operator-chart
 description: Maintenance Operator Helm Chart
 type: application
 version: 0.0.1
-appVersion: "v0.0.1-main"
+appVersion: "latest"
diff --git a/deployment/maintenance-operator-chart/README.md b/deployment/maintenance-operator-chart/README.md
@@ -0,0 +1,32 @@
+# maintenance-operator-chart
+
+![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: latest](https://img.shields.io/badge/AppVersion-latest-informational?style=flat-square)
+
+Maintenance Operator Helm Chart
+
+## Values
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| imagePullSecrets | list | `[]` | image pull secrets for the operator |
+| metricsService | object | `{"ports":[{"name":"https","port":8443,"protocol":"TCP","targetPort":"https"}],"type":"ClusterIP"}` | metrics service configurations |
+| operator.admissionController.certificates.certManager.enable | bool | `true` | use cert-manager for certificates |
+| operator.admissionController.certificates.certManager.generateSelfSigned | bool | `true` | generate self-signed certificates with cert-manager |
+| operator.admissionController.certificates.custom.enable | bool | `false` | enable custom certificates using secrets |
+| operator.admissionController.certificates.secretNames.operator | string | `"operator-webhook-cert"` | secret name containing certificates for the operator admission controller |
+| operator.admissionController.enable | bool | `true` | enable admission controller of the operator |
+| operator.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/master","operator":"Exists"}]},"weight":1},{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | node affinity for the operator |
+| operator.image.repository | string | `"ghcr.io/mellanox/maintenance-operator"` | repository to use for the operator image |
+| operator.image.tag | string | `nil` | image tag to use for the operator image |
+| operator.nodeSelector | object | `{}` | node selector for the operator |
+| operator.replicas | int | `1` | operator deployment number of replicas |
+| operator.resources | object | `{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}}` | specify resource requests and limits for the operator |
+| operator.serviceAccount.annotations | object | `{}` | set annotations for the operator service account |
+| operator.tolerations | list | `[{"effect":"NoSchedule","key":"node-role.kubernetes.io/master","operator":"Exists"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]` | toleration for the operator |
+| operatorConfig | object | `{"logLevel":"info","maxNodeMaintenanceTimeSeconds":null,"maxParallelOperations":null,"maxUnavailable":null}` | operator configuration values. fields here correspond to fields in MaintenanceOperatorConfig CR |
+| operatorConfig.logLevel | string | `"info"` | log level configuration |
+| operatorConfig.maxNodeMaintenanceTimeSeconds | string | `nil` | max time for node maintenance |
+| operatorConfig.maxParallelOperations | string | `nil` | max number of parallel operations |
+| operatorConfig.maxUnavailable | string | `nil` | max number of unavailable nodes |
+| webhookService | object | `{"ports":[{"port":443,"protocol":"TCP","targetPort":9443}],"type":"ClusterIP"}` | webhook service configurations |
+
diff --git a/deployment/maintenance-operator-chart/crds/maintenance.nvidia.com_nodemaintenances.yaml b/deployment/maintenance-operator-chart/crds/maintenance.nvidia.com_nodemaintenances.yaml
@@ -81,7 +81,8 @@ spec:
                     type: boolean
                   force:
                     default: false
-                    description: Force indicates if force draining is allowed
+                    description: Force draining even if there are pods that do not
+                      declare a controller
                     type: boolean
                   podEvictionFilters:
                     description: |-

diff --git a/deployment/maintenance-operator-chart/values.yaml b/deployment/maintenance-operator-chart/values.yaml
@@ -1,15 +1,20 @@
 operator:
   image:
+    # -- repository to use for the operator image
     repository: ghcr.io/mellanox/maintenance-operator
-    #tag: latest
+    # -- image tag to use for the operator image
+    tag: null
+  # -- toleration for the operator
   tolerations:
     - key: "node-role.kubernetes.io/master"
       operator: "Exists"
       effect: "NoSchedule"
     - key: "node-role.kubernetes.io/control-plane"
       operator: "Exists"
       effect: "NoSchedule"
+  # -- node selector for the operator
   nodeSelector: {}
+  # -- node affinity for the operator
   affinity:
     nodeAffinity:
       preferredDuringSchedulingIgnoredDuringExecution:
@@ -23,25 +28,33 @@ operator:
             matchExpressions:
               - key: "node-role.kubernetes.io/control-plane"
                 operator: Exists
+  # -- specify resource requests and limits for the operator
   resources:
     limits:
       cpu: 500m
       memory: 128Mi
     requests:
       cpu: 10m
       memory: 64Mi
+  # -- operator deployment number of replicas
   replicas: 1
   serviceAccount:
+    # -- set annotations for the operator service account
     annotations: {}
   admissionController:
+    # -- enable admission controller of the operator
     enable: true
     certificates:
       secretNames:
+        # -- secret name containing certificates for the operator admission controller
         operator: "operator-webhook-cert"
       certManager:
+        # -- use cert-manager for certificates
         enable: true
+        # -- generate self-signed certificates with cert-manager
         generateSelfSigned: true
       custom:
+        # -- enable custom certificates using secrets
         enable: false
         #   operator:
         #     caCrt: |
@@ -60,20 +73,30 @@ operator:
         #       ...
         #      -----END EC PRIVATE KEY-----
 
+# -- operator configuration values. fields here correspond to fields in MaintenanceOperatorConfig CR
 operatorConfig:
+  # -- log level configuration
   logLevel: info
-#  maxParallelOperations: nil
-#  maxUnavailable: nil
-#  maxNodeMaintenanceTimeSeconds: 1600
+  # operatorConfig.maxParallelOperations -- max number of parallel operations
+  maxParallelOperations: null
+  # -- max number of unavailable nodes
+  maxUnavailable: null
+  # -- max time for node maintenance
+  maxNodeMaintenanceTimeSeconds: null
 
+# -- image pull secrets for the operator
 imagePullSecrets: []
+
+# -- metrics service configurations
 metricsService:
   ports:
     - name: https
       port: 8443
       protocol: TCP
       targetPort: https
   type: ClusterIP
+
+# -- webhook service configurations
 webhookService:
   ports:
     - port: 443