From da48c304648ec45956c97ed66fc1af5ea578fa6f Mon Sep 17 00:00:00 2001
From: adrianc <adrianc@nvidia.com>
Date: Mon, 2 Sep 2024 13:11:06 +0300
Subject: [PATCH] Add docs

- Add Readme to project with basic install and usage
- Add API documentation
- Add Helm chart documentation

Signed-off-by: adrianc <adrianc@nvidia.com>
---
 Makefile                                      |  25 +
 README.md                                     | 143 +++++-
 api/v1alpha1/doc.go                           |  20 +
 api/v1alpha1/nodemaintenance_types.go         |   2 +-
 .../maintenance-operator-chart/Chart.yaml     |   2 +-
 .../maintenance-operator-chart/README.md      |  32 ++
 .../maintenance-operator-chart/values.yaml    |  31 +-
 docs/api-reference.md                         | 439 ++++++++++++++++++
 hack/api-docs/config.json                     |  28 ++
 hack/api-docs/templates/members.tpl           |  48 ++
 hack/api-docs/templates/pkg.tpl               |  49 ++
 hack/api-docs/templates/type.tpl              |  82 ++++
 12 files changed, 892 insertions(+), 9 deletions(-)
 create mode 100644 api/v1alpha1/doc.go
 create mode 100644 deployment/maintenance-operator-chart/README.md
 create mode 100644 docs/api-reference.md
 create mode 100644 hack/api-docs/config.json
 create mode 100644 hack/api-docs/templates/members.tpl
 create mode 100644 hack/api-docs/templates/pkg.tpl
 create mode 100644 hack/api-docs/templates/type.tpl

diff --git a/Makefile b/Makefile
index 95b0751..d2d361c 100644
--- a/Makefile
+++ b/Makefile
@@ -221,6 +221,18 @@ golangci-lint:
 	curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell dirname $(GOLANGCI_LINT)) $(GOLANGCI_LINT_VERSION) ;\
 	}
 
+GEN_CRD_API_REFERENCE_DOCS = $(LOCALBIN)/gen-crd-api-reference-docs
+.PHONY: gen-crd-api-reference-docs ## Download gen-crd-api-reference-docs locally if necessary (installs @latest; the version is not pinned)
+gen-crd-api-reference-docs: $(GEN_CRD_API_REFERENCE_DOCS)
+$(GEN_CRD_API_REFERENCE_DOCS): | $(LOCALBIN) # order-only prerequisite: LOCALBIN must exist, but its timestamp never triggers a re-install
+	@ GOBIN=$(LOCALBIN) go install github.com/ahmetb/gen-crd-api-reference-docs@latest
+
+HELM_DOCS = $(LOCALBIN)/helm-docs
+HELM_DOCS_VERSION ?= v1.14.2
+.PHONY: helm-docs ## Download helm-docs locally if necessary (version taken from HELM_DOCS_VERSION)
+helm-docs: $(HELM_DOCS)
+$(HELM_DOCS): | $(LOCALBIN) # order-only prerequisite: LOCALBIN must exist, but its timestamp never triggers a re-install
+	@ GOBIN=$(LOCALBIN) go install github.com/norwoodj/helm-docs/cmd/helm-docs@$(HELM_DOCS_VERSION)
 ##@ General
 
 # The help target prints out all targets with their descriptions organized
@@ -281,6 +293,19 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
 generate-mocks: mockery ## generate mock objects
 	PATH=$(LOCALBIN):$(PATH) go generate ./...
 
+
+.PHONY: generate-api-docs
+generate-api-docs: gen-crd-api-reference-docs ## generate api documentation (docs/api-reference.md) from the api/v1alpha1 types
+	mkdir -p $(BUILDDIR) && $(GEN_CRD_API_REFERENCE_DOCS) -api-dir=./api/v1alpha1 -config=$(CURDIR)/hack/api-docs/config.json \
+	-template-dir=$(CURDIR)/hack/api-docs/templates -out-file=$(BUILDDIR)/api-reference.html
+	$(CONTAINER_TOOL) run --rm --volume "$(CURDIR):/data:Z" pandoc/minimal -f html -t markdown_strict \
+	--columns 200 /data/build/api-reference.html -o /data/docs/api-reference.md
+	rm -f $(BUILDDIR)/api-reference.html
+
+.PHONY: generate-helm-docs
+generate-helm-docs: helm-docs ## generate helm chart documentation by running helm-docs inside the chart directory
+	cd deployment/maintenance-operator-chart && $(HELM_DOCS)
+
 ##@ Build
 
 .PHONY: build
diff --git a/README.md b/README.md
index 9b02769..41c0c09 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,144 @@
 [![CodeQL](https://github.com/Mellanox/maintenance-operator/actions/workflows/codeql.yml/badge.svg)](https://github.com/Mellanox/maintenance-operator/actions/workflows/codeql.yml)
 [![Image push](https://github.com/Mellanox/maintenance-operator/actions/workflows/image-push-main.yml/badge.svg?event=push)](https://github.com/Mellanox/maintenance-operator/actions/workflows/image-push-main.yml)
 
-# Nvidia Maintenance Operator
-coordinates node maintenance operations in K8s cluster
+# NVIDIA Maintenance Operator
 
-> __NOTE__: This project is currently under active development.
+NVIDIA Maintenance Operator provides a Kubernetes API (Custom Resource Definition) that allows node maintenance operations in a K8s cluster
+to be performed in a coordinated manner. It performs some common operations to prepare a node for maintenance, such as cordoning
+the node as well as draining it.
+
+Users/Consumers can request to perform maintenance on a node by creating a NodeMaintenance Custom Resource (CR).
+The operator will then reconcile NodeMaintenance CRs. At a high level, this is the reconcile flow:
+
+1. Scheduling - schedule NodeMaintenance to be processed by the operator, taking into account constraints
+  such as the maximal allowed parallel operations.
+2. Node preparation for maintenance such as cordon and draining of the node
+3. Mark NodeMaintenance as Ready (via condition)
+4. Cleanup on deletion of NodeMaintenance such as node uncordon
+
+## Deployment
+
+### Prerequisites
+
+* Kubernetes cluster
+
+### Helm
+
+#### Deploy latest from project sources
+
+```bash
+# Clone project
+git clone https://github.com/Mellanox/maintenance-operator.git ; cd maintenance-operator
+
+# Install Operator
+helm install -n maintenance-operator --create-namespace --set operator.image.tag=latest maintenance-operator ./deployment/maintenance-operator-chart
+
+# View deployed resources
+kubectl -n maintenance-operator get all
+```
+
+#### Deploy last release from OCI repo
+
+```bash
+helm install -n maintenance-operator --create-namespace maintenance-operator oci://ghcr.io/mellanox/maintenance-operator-chart
+```
+
+### Kustomize (for development)
+
+```bash
+# clone project
+git clone https://github.com/Mellanox/maintenance-operator.git ; cd maintenance-operator
+
+# build image
+IMG=harbor.mellanox.com/cloud-orchestration-dev/adrianc/maintenance-operator:latest make docker-build
+
+# push image
+IMG=harbor.mellanox.com/cloud-orchestration-dev/adrianc/maintenance-operator:latest make docker-push
+
+# deploy
+IMG=harbor.mellanox.com/cloud-orchestration-dev/adrianc/maintenance-operator:latest make deploy
+
+# undeploy
+make undeploy
+```
+
+## CRDs
+
+### MaintenanceOperatorConfig
+
+The MaintenanceOperatorConfig CRD is used for operator runtime configuration
+
+for more information refer to [api-reference](docs/api-reference.md)
+
+#### Example MaintenanceOperatorConfig
+
+```yaml
+apiVersion: maintenance.nvidia.com/v1alpha1
+kind: MaintenanceOperatorConfig
+metadata:
+  name: default
+  namespace: maintenance-operator
+spec:
+  logLevel: info
+  maxParallelOperations: 4
+```
+
+In this example we configure the following for the operator:
+
+* Log level (`logLevel`) is set to `info`
+* The max number of parallel maintenance operations (`maxParallelOperations`) is set to `4`
+
+### NodeMaintenance
+
+The NodeMaintenance CRD is used to request to perform a maintenance operation on a specific K8s node.
+In addition, it specifies which common (K8s related) operations need to happen in order to prepare a node for maintenance.
+
+Once the node is ready for maintenance, the operator will set the `Ready` condition in the `status` field to `True`.
+After the maintenance operation is done by the requestor, the NodeMaintenance CR should be deleted to finish the maintenance operation.
+
+for more information refer to [api-reference](docs/api-reference.md)
+
+#### Example NodeMaintenance
+
+```yaml
+apiVersion: maintenance.nvidia.com/v1alpha1
+kind: NodeMaintenance
+metadata:
+  name: my-maintenance-operation
+  namespace: default
+spec:
+  requestorID: some.one.acme.com
+  nodeName: worker-01
+  cordon: true
+  waitForPodCompletion:
+    podSelector: "app=important"
+    timeoutSeconds: 0
+  drainSpec:
+    force: true
+    podSelector: ""
+    timeoutSeconds: 0
+    deleteEmptyDir: true
+    podEvictionFilters:
+    - byResourceNameRegex: nvidia.com/gpu-*
+    - byResourceNameRegex: nvidia.com/rdma*
+
+```
+
+In this example we request to perform maintenance for node `worker-01`.
+
+The following steps will occur before the node is marked as ready for maintenance:
+
+1. cordon of `worker-01` node
+2. waiting for pods with `app: important` label to finish
+3. draining of `worker-01` with the provided `drainSpec`
+    1. force draining of pods even if they don't belong to a controller
+    2. allow draining of pods with emptyDir mount
+    3. only drain pods that consume either `nvidia.com/gpu-*`, `nvidia.com/rdma*` resources
+
+Once the node is ready for maintenance, the `Ready` condition will be `True`:
+
+```bash
+$ kubectl get nodemaintenances.maintenance.nvidia.com -A
+NAME                       NODE        REQUESTOR           READY   PHASE   FAILED
+my-maintenance-operation   worker-01   some.one.acme.com   True    Ready   
+```
diff --git a/api/v1alpha1/doc.go b/api/v1alpha1/doc.go
new file mode 100644
index 0000000..4f301ba
--- /dev/null
+++ b/api/v1alpha1/doc.go
@@ -0,0 +1,20 @@
+/*
+  2024 NVIDIA CORPORATION & AFFILIATES
+
+  Licensed under the Apache License, Version 2.0 (the License);
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an AS IS BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+// Package v1alpha1 contains API Schema definitions for the maintenance.nvidia.com v1alpha1 API group
+// +kubebuilder:object:generate=true
+// +groupName=maintenance.nvidia.com
+package v1alpha1
diff --git a/api/v1alpha1/nodemaintenance_types.go b/api/v1alpha1/nodemaintenance_types.go
index d61913b..6fa4d64 100644
--- a/api/v1alpha1/nodemaintenance_types.go
+++ b/api/v1alpha1/nodemaintenance_types.go
@@ -122,7 +122,7 @@ type WaitForPodCompletionSpec struct {
 
 // DrainSpec describes configuration for node drain during automatic upgrade
 type DrainSpec struct {
-	// Force indicates if force draining is allowed
+	// Force draining even if there are pods that do not declare a controller
 	// +kubebuilder:validation:Optional
 	// +kubebuilder:default:=false
 	Force bool `json:"force,omitempty"`
diff --git a/deployment/maintenance-operator-chart/Chart.yaml b/deployment/maintenance-operator-chart/Chart.yaml
index 8ebbc68..8b73ff4 100644
--- a/deployment/maintenance-operator-chart/Chart.yaml
+++ b/deployment/maintenance-operator-chart/Chart.yaml
@@ -3,4 +3,4 @@ name: maintenance-operator-chart
 description: Maintenance Operator Helm Chart
 type: application
 version: 0.0.1
-appVersion: "v0.0.1-main"
+appVersion: "latest"
diff --git a/deployment/maintenance-operator-chart/README.md b/deployment/maintenance-operator-chart/README.md
new file mode 100644
index 0000000..d778a54
--- /dev/null
+++ b/deployment/maintenance-operator-chart/README.md
@@ -0,0 +1,32 @@
+# maintenance-operator-chart
+
+![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: latest](https://img.shields.io/badge/AppVersion-latest-informational?style=flat-square)
+
+Maintenance Operator Helm Chart
+
+## Values
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| imagePullSecrets | list | `[]` | image pull secrets for the operator |
+| metricsService | object | `{"ports":[{"name":"https","port":8443,"protocol":"TCP","targetPort":"https"}],"type":"ClusterIP"}` | metrics service configurations |
+| operator.admissionController.certificates.certManager.enable | bool | `true` | use cert-manager for certificates |
+| operator.admissionController.certificates.certManager.generateSelfSigned | bool | `true` | generate self-signed certificates with cert-manager |
+| operator.admissionController.certificates.custom.enable | bool | `false` | enable custom certificates using secrets |
+| operator.admissionController.certificates.secretNames.operator | string | `"operator-webhook-cert"` | secret name containing certificates for the operator admission controller |
+| operator.admissionController.enable | bool | `true` | enable admission controller of the operator |
+| operator.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/master","operator":"Exists"}]},"weight":1},{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | node affinity for the operator |
+| operator.image.repository | string | `"ghcr.io/mellanox/maintenance-operator"` | repository to use for the operator image |
+| operator.image.tag | string | `nil` | image tag to use for the operator image |
+| operator.nodeSelector | object | `{}` | node selector for the operator |
+| operator.replicas | int | `1` | operator deployment number of replicas |
+| operator.resources | object | `{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}}` | specify resource requests and limits for the operator |
+| operator.serviceAccount.annotations | object | `{}` | set annotations for the operator service account |
+| operator.tolerations | list | `[{"effect":"NoSchedule","key":"node-role.kubernetes.io/master","operator":"Exists"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]` | toleration for the operator |
+| operatorConfig | object | `{"logLevel":"info","maxNodeMaintenanceTimeSeconds":null,"maxParallelOperations":null,"maxUnavailable":null}` | operator configuration values. fields here correspond to fields in MaintenanceOperatorConfig CR |
+| operatorConfig.logLevel | string | `"info"` | log level configuration |
+| operatorConfig.maxNodeMaintenanceTimeSeconds | string | `nil` | max time for node maintenance |
+| operatorConfig.maxParallelOperations | string | `nil` | max number of parallel operations |
+| operatorConfig.maxUnavailable | string | `nil` | max number of unavailable nodes |
+| webhookService | object | `{"ports":[{"port":443,"protocol":"TCP","targetPort":9443}],"type":"ClusterIP"}` | webhook service configurations |
+
diff --git a/deployment/maintenance-operator-chart/values.yaml b/deployment/maintenance-operator-chart/values.yaml
index 1177072..685b615 100644
--- a/deployment/maintenance-operator-chart/values.yaml
+++ b/deployment/maintenance-operator-chart/values.yaml
@@ -1,7 +1,10 @@
 operator:
   image:
+    # -- repository to use for the operator image
     repository: ghcr.io/mellanox/maintenance-operator
-    #tag: latest
+    # -- image tag to use for the operator image
+    tag: null
+  # -- toleration for the operator
   tolerations:
     - key: "node-role.kubernetes.io/master"
       operator: "Exists"
@@ -9,7 +12,9 @@ operator:
     - key: "node-role.kubernetes.io/control-plane"
       operator: "Exists"
       effect: "NoSchedule"
+  # -- node selector for the operator
   nodeSelector: {}
+  # -- node affinity for the operator
   affinity:
     nodeAffinity:
       preferredDuringSchedulingIgnoredDuringExecution:
@@ -23,6 +28,7 @@ operator:
             matchExpressions:
               - key: "node-role.kubernetes.io/control-plane"
                 operator: Exists
+  # -- specify resource requests and limits for the operator
   resources:
     limits:
       cpu: 500m
@@ -30,18 +36,25 @@ operator:
     requests:
       cpu: 10m
       memory: 64Mi
+  # -- operator deployment number of replicas
   replicas: 1
   serviceAccount:
+    # -- set annotations for the operator service account
     annotations: {}
   admissionController:
+    # -- enable admission controller of the operator
     enable: true
     certificates:
       secretNames:
+        # -- secret name containing certificates for the operator admission controller
         operator: "operator-webhook-cert"
       certManager:
+        # -- use cert-manager for certificates
         enable: true
+        # -- generate self-signed certificates with cert-manager
         generateSelfSigned: true
       custom:
+        # -- enable custom certificates using secrets
         enable: false
         #   operator:
         #     caCrt: |
@@ -60,13 +73,21 @@ operator:
         #       ...
         #      -----END EC PRIVATE KEY-----
 
+# -- operator configuration values. fields here correspond to fields in MaintenanceOperatorConfig CR
 operatorConfig:
+  # -- log level configuration
   logLevel: info
-#  maxParallelOperations: nil
-#  maxUnavailable: nil
-#  maxNodeMaintenanceTimeSeconds: 1600
+  # operatorConfig.maxParallelOperations -- max number of parallel operations
+  maxParallelOperations: null
+  # -- max number of unavailable nodes
+  maxUnavailable: null
+  # -- max time for node maintenance
+  maxNodeMaintenanceTimeSeconds: null
 
+# -- image pull secrets for the operator
 imagePullSecrets: []
+
+# -- metrics service configurations
 metricsService:
   ports:
     - name: https
@@ -74,6 +95,8 @@ metricsService:
       protocol: TCP
       targetPort: https
   type: ClusterIP
+
+# -- webhook service configurations
 webhookService:
   ports:
     - port: 443
diff --git a/docs/api-reference.md b/docs/api-reference.md
new file mode 100644
index 0000000..89c3be1
--- /dev/null
+++ b/docs/api-reference.md
@@ -0,0 +1,439 @@
+Packages:
+
+-   [maintenance.nvidia.com/v1alpha1](#maintenance.nvidia.com%2fv1alpha1)
+
+## maintenance.nvidia.com/v1alpha1
+
+Package v1alpha1 contains API Schema definitions for the maintenance.nvidia.com v1alpha1 API group
+
+Resource Types:
+
+### DrainSpec
+
+(*Appears on:*[NodeMaintenanceSpec](#maintenance.nvidia.com/v1alpha1.NodeMaintenanceSpec))
+
+DrainSpec describes configuration for node drain during automatic upgrade
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>force</code><br />
+<em>bool</em></td>
+<td><p>Force draining even if there are pods that do not declare a controller</p></td>
+</tr>
+<tr>
+<td><code>podSelector</code><br />
+<em>string</em></td>
+<td><p>PodSelector specifies a label selector to filter pods on the node that need to be drained For more details on label selectors, see: <a
+href="https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors">https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors</a></p></td>
+</tr>
+<tr>
+<td><code>timeoutSeconds</code><br />
+<em>int32</em></td>
+<td><p>TimeoutSecond specifies the length of time in seconds to wait before giving up drain, zero means infinite</p></td>
+</tr>
+<tr>
+<td><code>deleteEmptyDir</code><br />
+<em>bool</em></td>
+<td><p>DeleteEmptyDir indicates if should continue even if there are pods using emptyDir (local data that will be deleted when the node is drained)</p></td>
+</tr>
+<tr>
+<td><code>podEvictionFilters</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.PodEvictionFiterEntry">[]PodEvictionFiterEntry</a></em></td>
+<td><p>PodEvictionFilters specifies filters for pods that need to undergo eviction during drain. if specified. only pods that match PodEvictionFilters will be evicted during drain operation. if
+unspecified. all non-daemonset pods will be evicted. logical OR is performed between filter entires. logical AND is performed within different filters in a filter entry.</p></td>
+</tr>
+</tbody>
+</table>
+
+### DrainStatus
+
+(*Appears on:*[NodeMaintenanceStatus](#maintenance.nvidia.com/v1alpha1.NodeMaintenanceStatus))
+
+DrainStatus represents the status of draining for the node
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>totalPods</code><br />
+<em>int32</em></td>
+<td><p>TotalPods is the number of pods on the node at the time NodeMaintenance started draining</p></td>
+</tr>
+<tr>
+<td><code>evictionPods</code><br />
+<em>int32</em></td>
+<td><p>EvictionPods is the total number of pods that need to be evicted at the time NodeMaintenance started draining</p></td>
+</tr>
+<tr>
+<td><code>drainProgress</code><br />
+<em>int32</em></td>
+<td><p>DrainProgress represents the draining progress as percentage</p></td>
+</tr>
+<tr>
+<td><code>waitForEviction</code><br />
+<em>[]string</em></td>
+<td><p>WaitForEviction is the list of namespaced named pods that need to be evicted</p></td>
+</tr>
+</tbody>
+</table>
+
+### MaintenanceOperatorConfig
+
+MaintenanceOperatorConfig is the Schema for the maintenanceoperatorconfigs API
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>metadata</code><br />
+<em><a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta">Kubernetes meta/v1.ObjectMeta</a></em></td>
+<td>Refer to the Kubernetes API documentation for the fields of the <code>metadata</code> field.</td>
+</tr>
+<tr>
+<td><code>spec</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.MaintenanceOperatorConfigSpec">MaintenanceOperatorConfigSpec</a></em></td>
+<td><br />
+<br />
+&#10;<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<tbody>
+<tr>
+<td><code>maxParallelOperations</code><br />
+<em><a href="https://pkg.go.dev/k8s.io/apimachinery/pkg/util/intstr#IntOrString">k8s.io/apimachinery/pkg/util/intstr.IntOrString</a></em></td>
+<td><p>MaxParallelOperations indicates the maximal number nodes that can undergo maintenance at a given time. 0 means no limit value can be an absolute number (ex: 5) or a percentage of total nodes in
+the cluster (ex: 10%). absolute number is calculated from percentage by rounding up. defaults to 1. The actual number of nodes that can undergo maintenance may be lower depending on the value of
+MaintenanceOperatorConfigSpec.MaxUnavailable.</p></td>
+</tr>
+<tr>
+<td><code>maxUnavailable</code><br />
+<em><a href="https://pkg.go.dev/k8s.io/apimachinery/pkg/util/intstr#IntOrString">k8s.io/apimachinery/pkg/util/intstr.IntOrString</a></em></td>
+<td><p>MaxUnavailable is the maximum number of nodes that can become unavailable in the cluster. value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%).
+absolute number is calculated from percentage by rounding up. by default, unset. new nodes will not be processed if the number of unavailable node will exceed this value</p></td>
+</tr>
+<tr>
+<td><code>logLevel</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.OperatorLogLevel">OperatorLogLevel</a></em></td>
+<td><p>LogLevel is the operator logging level</p></td>
+</tr>
+<tr>
+<td><code>maxNodeMaintenanceTimeSeconds</code><br />
+<em>int32</em></td>
+<td><p>MaxNodeMaintenanceTimeSeconds is the time from when a NodeMaintenance is marked as ready (phase: Ready) until the NodeMaintenance is considered stale and removed by the operator. should be less
+than idle time for any autoscaler that is running. default to 30m (1600 seconds)</p></td>
+</tr>
+</tbody>
+</table></td>
+</tr>
+</tbody>
+</table>
+
+### MaintenanceOperatorConfigSpec
+
+(*Appears on:*[MaintenanceOperatorConfig](#maintenance.nvidia.com/v1alpha1.MaintenanceOperatorConfig))
+
+MaintenanceOperatorConfigSpec defines the desired state of MaintenanceOperatorConfig
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>maxParallelOperations</code><br />
+<em><a href="https://pkg.go.dev/k8s.io/apimachinery/pkg/util/intstr#IntOrString">k8s.io/apimachinery/pkg/util/intstr.IntOrString</a></em></td>
+<td><p>MaxParallelOperations indicates the maximal number nodes that can undergo maintenance at a given time. 0 means no limit value can be an absolute number (ex: 5) or a percentage of total nodes in
+the cluster (ex: 10%). absolute number is calculated from percentage by rounding up. defaults to 1. The actual number of nodes that can undergo maintenance may be lower depending on the value of
+MaintenanceOperatorConfigSpec.MaxUnavailable.</p></td>
+</tr>
+<tr>
+<td><code>maxUnavailable</code><br />
+<em><a href="https://pkg.go.dev/k8s.io/apimachinery/pkg/util/intstr#IntOrString">k8s.io/apimachinery/pkg/util/intstr.IntOrString</a></em></td>
+<td><p>MaxUnavailable is the maximum number of nodes that can become unavailable in the cluster. value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%).
+absolute number is calculated from percentage by rounding up. by default, unset. new nodes will not be processed if the number of unavailable node will exceed this value</p></td>
+</tr>
+<tr>
+<td><code>logLevel</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.OperatorLogLevel">OperatorLogLevel</a></em></td>
+<td><p>LogLevel is the operator logging level</p></td>
+</tr>
+<tr>
+<td><code>maxNodeMaintenanceTimeSeconds</code><br />
+<em>int32</em></td>
+<td><p>MaxNodeMaintenanceTimeSeconds is the time from when a NodeMaintenance is marked as ready (phase: Ready) until the NodeMaintenance is considered stale and removed by the operator. should be less
+than idle time for any autoscaler that is running. default to 30m (1600 seconds)</p></td>
+</tr>
+</tbody>
+</table>
+
+### NodeMaintenance
+
+NodeMaintenance is the Schema for the nodemaintenances API
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>metadata</code><br />
+<em><a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta">Kubernetes meta/v1.ObjectMeta</a></em></td>
+<td>Refer to the Kubernetes API documentation for the fields of the <code>metadata</code> field.</td>
+</tr>
+<tr>
+<td><code>spec</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.NodeMaintenanceSpec">NodeMaintenanceSpec</a></em></td>
+<td><br />
+<br />
+&#10;<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<tbody>
+<tr>
+<td><code>requestorID</code><br />
+<em>string</em></td>
+<td><p>RequestorID MUST follow domain name notation format (<a href="https://tools.ietf.org/html/rfc1035#section-2.3.1">https://tools.ietf.org/html/rfc1035#section-2.3.1</a>) It MUST be 63 characters
+or less, beginning and ending with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), dots (.), and alphanumerics between. caller SHOULD NOT create multiple objects with same requestorID and
+nodeName. This field identifies the requestor of the operation.</p></td>
+</tr>
+<tr>
+<td><code>additionalRequestors</code><br />
+<em>[]string</em></td>
+<td><p>AdditionalRequestors is a set of additional requestor IDs which are using the same NodeMaintenance request. addition or removal of requiestor IDs to this list MUST be made with update operation
+(and retry on failure) which will replace the entire list.</p></td>
+</tr>
+<tr>
+<td><code>nodeName</code><br />
+<em>string</em></td>
+<td><p>NodeName is The name of the node that maintenance operation will be performed on creation fails if node obj does not exist (webhook)</p></td>
+</tr>
+<tr>
+<td><code>cordon</code><br />
+<em>bool</em></td>
+<td><p>Cordon if set, marks node as unschedulable during maintenance operation</p></td>
+</tr>
+<tr>
+<td><code>waitForPodCompletion</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.WaitForPodCompletionSpec">WaitForPodCompletionSpec</a></em></td>
+<td><p>WaitForPodCompletion specifies pods via selector to wait for completion before performing drain operation if not provided, will not wait for pods to complete</p></td>
+</tr>
+<tr>
+<td><code>drainSpec</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.DrainSpec">DrainSpec</a></em></td>
+<td><p>DrainSpec specifies how a node will be drained. if not provided, no draining will be performed.</p></td>
+</tr>
+</tbody>
+</table></td>
+</tr>
+<tr>
+<td><code>status</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.NodeMaintenanceStatus">NodeMaintenanceStatus</a></em></td>
+<td></td>
+</tr>
+</tbody>
+</table>
+
+### NodeMaintenanceSpec
+
+(*Appears on:*[NodeMaintenance](#maintenance.nvidia.com/v1alpha1.NodeMaintenance))
+
+NodeMaintenanceSpec defines the desired state of NodeMaintenance
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>requestorID</code><br />
+<em>string</em></td>
+<td><p>RequestorID MUST follow domain name notation format (<a href="https://tools.ietf.org/html/rfc1035#section-2.3.1">https://tools.ietf.org/html/rfc1035#section-2.3.1</a>) It MUST be 63 characters
+or less, beginning and ending with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), dots (.), and alphanumerics between. caller SHOULD NOT create multiple objects with same requestorID and
+nodeName. This field identifies the requestor of the operation.</p></td>
+</tr>
+<tr>
+<td><code>additionalRequestors</code><br />
+<em>[]string</em></td>
+<td><p>AdditionalRequestors is a set of additional requestor IDs which are using the same NodeMaintenance request. addition or removal of requiestor IDs to this list MUST be made with update operation
+(and retry on failure) which will replace the entire list.</p></td>
+</tr>
+<tr>
+<td><code>nodeName</code><br />
+<em>string</em></td>
+<td><p>NodeName is The name of the node that maintenance operation will be performed on creation fails if node obj does not exist (webhook)</p></td>
+</tr>
+<tr>
+<td><code>cordon</code><br />
+<em>bool</em></td>
+<td><p>Cordon if set, marks node as unschedulable during maintenance operation</p></td>
+</tr>
+<tr>
+<td><code>waitForPodCompletion</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.WaitForPodCompletionSpec">WaitForPodCompletionSpec</a></em></td>
+<td><p>WaitForPodCompletion specifies pods via selector to wait for completion before performing drain operation if not provided, will not wait for pods to complete</p></td>
+</tr>
+<tr>
+<td><code>drainSpec</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.DrainSpec">DrainSpec</a></em></td>
+<td><p>DrainSpec specifies how a node will be drained. if not provided, no draining will be performed.</p></td>
+</tr>
+</tbody>
+</table>
+
+### NodeMaintenanceStatus
+
+(*Appears on:*[NodeMaintenance](#maintenance.nvidia.com/v1alpha1.NodeMaintenance))
+
+NodeMaintenanceStatus defines the observed state of NodeMaintenance
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>conditions</code><br />
+<em><a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta">[]Kubernetes meta/v1.Condition</a></em></td>
+<td><p>Conditions represents observations of NodeMaintenance current state</p></td>
+</tr>
+<tr>
+<td><code>waitForCompletion</code><br />
+<em>[]string</em></td>
+<td><p>WaitForCompletion is the list of namespaced named pods that we wait to complete</p></td>
+</tr>
+<tr>
+<td><code>drain</code><br />
+<em><a href="#maintenance.nvidia.com/v1alpha1.DrainStatus">DrainStatus</a></em></td>
+<td><p>Drain represents the drain status of the node</p></td>
+</tr>
+</tbody>
+</table>
+
+### OperatorLogLevel (`string` alias)
+
+(*Appears on:*[MaintenanceOperatorConfigSpec](#maintenance.nvidia.com/v1alpha1.MaintenanceOperatorConfigSpec))
+
+OperatorLogLevel is the operator log level. one of: \[“debug”, “info”, “error”\]
+
+### PodEvictionFiterEntry
+
+(*Appears on:*[DrainSpec](#maintenance.nvidia.com/v1alpha1.DrainSpec))
+
+PodEvictionFiterEntry defines filters for Pod evictions during drain operation
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>byResourceNameRegex</code><br />
+<em>string</em></td>
+<td><p>ByResourceNameRegex filters pods by the name of the resources they consume using regex.</p></td>
+</tr>
+</tbody>
+</table>
+
+### WaitForPodCompletionSpec
+
+(*Appears on:*[NodeMaintenanceSpec](#maintenance.nvidia.com/v1alpha1.NodeMaintenanceSpec))
+
+WaitForPodCompletionSpec describes the configuration for waiting on pods completion
+
+<table>
+<colgroup>
+<col style="width: 50%" />
+<col style="width: 50%" />
+</colgroup>
+<thead>
+<tr>
+<th>Field</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>podSelector</code><br />
+<em>string</em></td>
+<td><p>PodSelector specifies a label selector for the pods to wait for completion For more details on label selectors, see: <a
+href="https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors">https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors</a></p></td>
+</tr>
+<tr>
+<td><code>timeoutSeconds</code><br />
+<em>int32</em></td>
+<td><p>TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means infinite</p></td>
+</tr>
+</tbody>
+</table>
+
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+*Generated with `gen-crd-api-reference-docs` on git commit `7ade571`.*
diff --git a/hack/api-docs/config.json b/hack/api-docs/config.json
new file mode 100644
index 0000000..12a4e45
--- /dev/null
+++ b/hack/api-docs/config.json
@@ -0,0 +1,28 @@
+{
+    "hideMemberFields": [
+        "TypeMeta"
+    ],
+    "hideTypePatterns": [
+        "ParseError$",
+        "List$"
+    ],
+    "externalPackages": [
+        {
+            "typeMatchPrefix": "^k8s\\.io/apimachinery/pkg/util/intstr\\.IntOrString$",
+            "docsURLTemplate": "https://pkg.go.dev/k8s.io/apimachinery/pkg/util/intstr#IntOrString"
+        },
+        {
+            "typeMatchPrefix": "^k8s\\.io/apimachinery/pkg/apis/meta/v1\\.Duration$",
+            "docsURLTemplate": "https://pkg.go.dev/k8s.io/apimachinery/pkg/apis/meta/v1#Duration"
+        },
+        {
+            "typeMatchPrefix": "^k8s\\.io/(api|apimachinery/pkg/apis)/",
+            "docsURLTemplate": "https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#{{lower .TypeIdentifier}}-{{arrIndex .PackageSegments -1}}-{{arrIndex .PackageSegments -2}}"
+        }
+    ],
+    "typeDisplayNamePrefixOverrides": {
+        "k8s.io/api/": "Kubernetes ",
+        "k8s.io/apimachinery/pkg/apis/": "Kubernetes "
+    },
+    "markdownDisabled": false
+}
diff --git a/hack/api-docs/templates/members.tpl b/hack/api-docs/templates/members.tpl
new file mode 100644
index 0000000..448ce2f
--- /dev/null
+++ b/hack/api-docs/templates/members.tpl
@@ -0,0 +1,48 @@
+{{ define "members" }}
+{{/* Emits one table row per non-hidden member of the current type: field name and (linked, when a link exists) type in the first cell, description in the second. */}}
+{{ range .Members }}
+{{ if not (hiddenMember .)}}
+<tr>
+    <td>
+        <code>{{ fieldName . }}</code><br/>
+        <em>
+            {{ if linkForType .Type }}
+                <a href="{{ linkForType .Type}}">
+                    {{ typeDisplayName .Type }}
+                </a>
+            {{ else }}
+                {{ typeDisplayName .Type }}
+            {{ end }}
+        </em>
+    </td>
+    <td>
+        {{ if fieldEmbedded . }}
+            <p>
+                (Members of <code>{{ fieldName . }}</code> are embedded into this type.)
+            </p>
+        {{ end}}
+{{/* Optional members are flagged before their rendered comment text. */}}
+        {{ if isOptionalMember .}}
+            <em>(Optional)</em>
+        {{ end }}
+
+        {{ safe (renderComments .CommentLines) }}
+{{/* NOTE(review): "and"/"or" each receive a single argument below, so both conditions reduce to the eq test alone. A member named "spec" has its own members expanded in a nested table by re-invoking this template on its type. */}}
+    {{ if and (eq (.Type.Name.Name) "ObjectMeta") }}
+        Refer to the Kubernetes API documentation for the fields of the
+        <code>metadata</code> field.
+    {{ end }}
+
+    {{ if or (eq (fieldName .) "spec") }}
+        <br/>
+        <br/>
+        <table>
+            {{ template "members" .Type }}
+        </table>
+    {{ end }}
+    </td>
+</tr>
+{{ end }}
+{{ end }}
+
+{{ end }}
\ No newline at end of file
diff --git a/hack/api-docs/templates/pkg.tpl b/hack/api-docs/templates/pkg.tpl
new file mode 100644
index 0000000..aacf6e0
--- /dev/null
+++ b/hack/api-docs/templates/pkg.tpl
@@ -0,0 +1,49 @@
+{{ define "packages" }}
+{{/* Page entry point: a linked list of packages, then per package a heading, the doc comments of its first Go package, a list of exported types, and every visible type rendered via the "type" template. */}}
+{{ with .packages}}
+<p>Packages:</p>
+<ul>
+    {{ range . }}
+    <li>
+        <a href="#{{- packageAnchorID . -}}">{{ packageDisplayName . }}</a>
+    </li>
+    {{ end }}
+</ul>
+{{ end}}
+
+{{ range .packages }}
+    <h2 id="{{- packageAnchorID . -}}">
+        {{- packageDisplayName . -}}
+    </h2>
+
+    {{ with (index .GoPackages 0 )}}
+        {{ with .DocComments }}
+        <div>
+            {{ safe (renderComments .) }}
+        </div>
+        {{ end }}
+    {{ end }}
+
+    Resource Types:
+    <ul>
+    {{- range (visibleTypes (sortedTypes .Types)) -}}
+        {{ if isExportedType . -}}
+        <li>
+            <a href="{{ linkForType . }}">{{ typeDisplayName . }}</a>
+        </li>
+        {{- end }}
+    {{- end -}}
+    </ul>
+
+    {{ range (visibleTypes (sortedTypes .Types))}}
+        {{ template "type" .  }}
+    {{ end }}
+    <hr/>
+{{ end }}
+{{/* Footer: names the generator and, only when .gitCommit is set, the commit the docs were generated from. */}}
+<p><em>
+    Generated with <code>gen-crd-api-reference-docs</code>
+    {{ with .gitCommit }} on git commit <code>{{ . }}</code>{{end}}.
+</em></p>
+
+{{ end }}
\ No newline at end of file
diff --git a/hack/api-docs/templates/type.tpl b/hack/api-docs/templates/type.tpl
new file mode 100644
index 0000000..9558611
--- /dev/null
+++ b/hack/api-docs/templates/type.tpl
@@ -0,0 +1,82 @@
+
+{{ define "type" }}
+{{/* Renders a single type: an anchored heading (with the underlying type noted for aliases), the types it appears on, its comments, a constants table when it has constants, and a field table when it has members. */}}
+<h3 id="{{ anchorIDForType . }}">
+    {{- .Name.Name }}
+    {{ if eq .Kind "Alias" }}(<code>{{.Underlying}}</code> alias){{ end -}}
+</h3>
+{{ with (typeReferences .) }}
+    <p>
+        (<em>Appears on:</em>
+        {{- $prev := "" -}}
+        {{- range . -}}
+            {{- if $prev -}}, {{ end -}}
+            {{- $prev = . -}}
+            <a href="{{ linkForType . }}">{{ typeDisplayName . }}</a>
+        {{- end -}}
+        )
+    </p>
+{{ end }}
+
+<div>
+    {{ safe (renderComments .CommentLines) }}
+</div>
+{{/* Constants table: skipped entirely when constantsOfType yields nothing for this type. */}}
+{{ with (constantsOfType .) }}
+<table>
+    <thead>
+        <tr>
+            <th>Value</th>
+            <th>Description</th>
+        </tr>
+    </thead>
+    <tbody>
+      {{- range . -}}
+      <tr>
+        {{- /*
+            renderComments implicitly creates a <p> element, so we
+            add one to the display name as well to make the contents
+            of the two cells align evenly.
+        */ -}}
+        <td><p>{{ typeDisplayName . }}</p></td>
+        <td>{{ safe (renderComments .CommentLines) }}</td>
+      </tr>
+      {{- end -}}
+    </tbody>
+</table>
+{{ end }}
+{{/* Field table: exported types get apiVersion and kind rows first, then the rows produced by the "members" template. */}}
+{{ if .Members }}
+<table>
+    <thead>
+        <tr>
+            <th>Field</th>
+            <th>Description</th>
+        </tr>
+    </thead>
+    <tbody>
+        {{ if isExportedType . }}
+        <tr>
+            <td>
+                <code>apiVersion</code><br/>
+                string</td>
+            <td>
+                <code>
+                    {{apiGroup .}}
+                </code>
+            </td>
+        </tr>
+        <tr>
+            <td>
+                <code>kind</code><br/>
+                string
+            </td>
+            <td><code>{{.Name.Name}}</code></td>
+        </tr>
+        {{ end }}
+        {{ template "members" .}}
+    </tbody>
+</table>
+{{ end }}
+
+{{ end }}