Release 0.2.0 (#183)

- Release a configurable CPU vertical scaling actuator. - CPU vertical scaling actuator as plugin. - Adoption of bi-directional gRPC stream on NextState-Planner communication. - Improvements to the analytical scripts for example usage. - Enhancements to RDT/DRC actuator. - Bump to k8s 1.26 dependencies and version CI tools. - Update and optimize the container base images. - PodState refactored to allow better access to resources/annotations. - Many security improvements at container level. - Documentation updates. Co-authored-by: tmetsch <[email protected]> Co-authored-by: togashidm <[email protected]> Co-authored-by: Paulina-Osikoya <[email protected]> Co-authored-by: obiTrinobiIntel <[email protected]>
intel · Apr 12, 2023 · a0413eb · a0413eb
1 parent 443c316
commit a0413eb
Show file tree

Hide file tree

Showing 67 changed files with 3,633 additions and 1,202 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,19 @@
+_attic/
+vendor/
+bin/
+# coverage outputs etc.
+*.out
+*.dot*
+coverage.html
+# IDE related stuff.
+.vscode/
+.idea/
+# profiling
+*.test
+*.profile
+*.cpuprofile
+# in-tree folders not used for build process
+bin
+docs
+.git
+.github
diff --git a/.github/workflows/sca.yml b/.github/workflows/sca.yml
@@ -4,6 +4,8 @@ on:
     branches: [ '**' ]
   pull_request:
     branches: [ '**' ]
+permissions:
+  contents: read
 jobs:
   shellcheck:
     name: Shellcheck
@@ -20,19 +22,22 @@ jobs:
     name: Hadolint
     steps:
       - uses: actions/checkout@v3
-      - run: wget -q https://github.com/hadolint/hadolint/releases/download/v2.10.0/hadolint-Linux-x86_64 -O hadolint; chmod +x hadolint ; find . -type f \( -name "Dockerfile*" \) -print0 | xargs -n 1 -0 ./hadolint ;
+      - run: wget -q https://github.com/hadolint/hadolint/releases/download/v2.12.0/hadolint-Linux-x86_64 -O hadolint; chmod +x hadolint ; find . -type f \( -name "Dockerfile*" \) -print0 | xargs -n 1 -0 ./hadolint ;
   gofmt-imports:
     runs-on: ubuntu-latest
     name: Go Fmt and Go Import
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-go@v3
         with:
-          go-version-file: 'go.mod'
+          go-version: 1.19
       - run:  |
-          go install golang.org/x/tools/cmd/goimports@v0.1.12 && goimports -l . && gofmt -l .
+          go install golang.org/x/tools/cmd/goimports@v0.6.0 && goimports -l . && gofmt -l .
         shell: bash
   golangci:
+    permissions:
+      contents: read
+      pull-requests: read
     runs-on: ubuntu-latest
     name: lint
     steps:
@@ -41,8 +46,7 @@ jobs:
           go-version: 1.19
       - uses: actions/checkout@v3
       - name: golangci-lint
-        uses: golangci/golangci-lint-action@v3
-        with:
-          version: v1.50.0
-          # Additional linting tools can be added here
-          args: --enable=revive,errcheck,goimports,govet,nilerr,gosec --timeout=5m
+        run: |
+          go install github.com/golangci/golangci-lint/cmd/[email protected]
+          make golangci-lint
+        shell: bash
diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
@@ -4,6 +4,8 @@ on:
     branches: [ '**' ]
   pull_request:
     branches: [ '**' ]
+permissions:
+  contents: read
 jobs:
   build:
     runs-on: ubuntu-latest

diff --git a/Dockerfile b/Dockerfile
@@ -8,12 +8,10 @@ WORKDIR /app
 COPY . ./
 
 RUN make prepare-build build \
-    && go run github.com/google/go-licenses@v1.3.1 save "./..." --save_path licenses \
+    && go run github.com/google/go-licenses@v1.6.0 save "./..." --save_path licenses \
     && hack/additional-licenses.sh
 
-FROM alpine:3.16
-
-RUN adduser -D nonroot
+FROM scratch
 
 WORKDIR /app
 

diff --git a/Makefile b/Makefile
@@ -2,8 +2,9 @@ BINARY_NAME=planner
 SCALEOUT_PLUGIN=scale_out
 RMPOD_PLUGIN=rm_pod
 RDT_PLUGIN=rdt
+CPU_PLUGIN=cpu_scale
 GO_CILINT_CHECKERS=errcheck,goimports,gosec,gosimple,govet,ineffassign,nilerr,revive,staticcheck,unused
-DOCKER_IMAGE_VERSION=0.1.1
+DOCKER_IMAGE_VERSION=0.2.0
 
 api:
 	hack/generate_code.sh
@@ -19,18 +20,28 @@ gen_code: api proto
 build:
 	CGO_ENABLED=0 go build -o bin/${BINARY_NAME} cmd/main.go
 
-build-plugins:
+build-plugin-scaleout:
 	CGO_ENABLED=0 go build -o bin/plugins/${SCALEOUT_PLUGIN} plugins/${SCALEOUT_PLUGIN}/cmd/${SCALEOUT_PLUGIN}.go
+
+build-plugin-rmpod:
 	CGO_ENABLED=0 go build -o bin/plugins/${RMPOD_PLUGIN} plugins/${RMPOD_PLUGIN}/cmd/${RMPOD_PLUGIN}.go
+
+build-plugin-rdt:
 	CGO_ENABLED=0 go build -o bin/plugins/${RDT_PLUGIN} plugins/${RDT_PLUGIN}/cmd/${RDT_PLUGIN}.go
 
+build-plugin-cpu:
+	CGO_ENABLED=0 go build -o bin/plugins/${CPU_PLUGIN} plugins/${CPU_PLUGIN}/cmd/${CPU_PLUGIN}.go
+
+build-plugins: build-plugin-scaleout build-plugin-rmpod build-plugin-rdt build-plugin-cpu
+
 controller-images:
-	docker build -t planner:${DOCKER_IMAGE_VERSION} .
+	docker build -t planner:${DOCKER_IMAGE_VERSION} . --no-cache --pull
 
 plugin-images:
-	docker build -t scaleout:${DOCKER_IMAGE_VERSION} -f plugins/scale_out/Dockerfile .
-	docker build -t rmpod:${DOCKER_IMAGE_VERSION} -f plugins/rm_pod/Dockerfile .
-	docker build -t rdt:${DOCKER_IMAGE_VERSION} -f plugins/rdt/Dockerfile .
+	docker build -t scaleout:${DOCKER_IMAGE_VERSION} -f plugins/scale_out/Dockerfile . --no-cache --pull
+	docker build -t rmpod:${DOCKER_IMAGE_VERSION} -f plugins/rm_pod/Dockerfile . --no-cache --pull
+	docker build -t rdt:${DOCKER_IMAGE_VERSION} -f plugins/rdt/Dockerfile . --no-cache --pull
+	docker build -t cpuscale:${DOCKER_IMAGE_VERSION} -f plugins/cpu_scale/Dockerfile . --no-cache --pull
 
 all-images: controller-images plugin-images
 

diff --git a/README.md b/README.md
@@ -58,14 +58,25 @@ Step 1) add the CRDs:
 
 Step 2) deploy the planner (make sure to adapt the configs to your environment):
 
-    $ k apply -f artefacts/deploy/manifest.yaml
+    $ k create ns ido
+    $ k apply -n ido -f artefacts/deploy/manifest.yaml
 
 Step 3) deploy the actuators of interest using:
 
-    $ k apply -f plugins/<name>/<name>.yaml
+    $ k apply -n ido -f plugins/<name>/<name>.yaml
 
 These steps should be followed by setting up your default profiles (if needed).
 
+We recommend using a service mesh like [Linkerd](https://linkerd.io/) or [Istio](https://istio.io/) to ensure
+encryption and monitoring capabilities for the subcomponents of the planning framework themselves. After creating the
+namespace, enable auto-injection. For Linkerd, run:
+
+    $ k annotate ns ido linkerd.io/inject=enabled
+
+or for Istio use:
+
+    $ k label namespace ido istio-injection=enabled --overwrite
+
 For more information on running and configuring the planner see the [getting started](docs/getting_started.md) guide.
 
 ## Internals

diff --git a/artefacts/deploy/manifest.yaml b/artefacts/deploy/manifest.yaml
@@ -102,12 +102,12 @@ rules:
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
-  namespace: default
   name: planner-role-binding
+  namespace: ido
 subjects:
   - kind: ServiceAccount
-    namespace: default
     name: planner-service-account
+    namespace: ido
 roleRef:
   kind: ClusterRole
   name: planner-role
@@ -122,28 +122,48 @@ metadata:
 spec:
   containers:
     - name: mongodb
-      image: mongo
+      image: mongo:6
       ports:
         - containerPort: 27017
+      securityContext:
+        capabilities:
+          drop: [ 'ALL' ]
+        seccompProfile:
+          type: RuntimeDefault
+        allowPrivilegeEscalation: false
+        readOnlyRootFilesystem: true
+        runAsNonRoot: true
+        runAsUser: 10001
+        runAsGroup: 10001
+      volumeMounts:
+        - name: mongo-tmp
+          mountPath: /tmp/
+        - name: data
+          mountPath: /data/db
       resources:
         limits:
           memory: "4000Mi"
           cpu: "2000m"
         requests:
           memory: "256Mi"
           cpu: "500m"
+  volumes:
+    - name: mongo-tmp
+      emptyDir:
+    - name: data
+      emptyDir:
   tolerations:
-  - key: node-role.kubernetes.io/master
-    operator: Exists
-  - key: node-role.kubernetes.io/control-plane
-    operator: Exists
+    - key: node-role.kubernetes.io/master
+      operator: Exists
+    - key: node-role.kubernetes.io/control-plane
+      operator: Exists
   affinity:
     nodeAffinity:
       requiredDuringSchedulingIgnoredDuringExecution:
         nodeSelectorTerms:
-        - matchExpressions:
-          - key: node-role.kubernetes.io/control-plane
-            operator: Exists
+          - matchExpressions:
+              - key: node-role.kubernetes.io/control-plane
+                operator: Exists
 ---
 apiVersion: v1
 kind: Service
@@ -179,17 +199,21 @@ spec:
   serviceAccountName: planner-service-account
   containers:
     - name: planner
-      image: 127.0.0.1:5000/planner:0.1.1
+      image: 127.0.0.1:5000/planner:0.2.0
       ports:
         - containerPort: 33333
       imagePullPolicy: Always
       args: [ "-config", "/config/defaults.json", "-v", "2" ]
       securityContext:
         capabilities:
-          drop:
-            - all
+          drop: [ 'ALL' ]
+        seccompProfile:
+          type: RuntimeDefault
+        allowPrivilegeEscalation: false
+        readOnlyRootFilesystem: true
         runAsNonRoot: true
         runAsUser: 10001
+        runAsGroup: 10001
       resources:
         limits:
           memory: "1000Mi"
@@ -207,17 +231,17 @@ spec:
         - name: MONGO_URL
           value: "mongodb://planner-mongodb-service:27017/"
   tolerations:
-  - key: node-role.kubernetes.io/master
-    operator: Exists
-  - key: node-role.kubernetes.io/control-plane
-    operator: Exists
+    - key: node-role.kubernetes.io/master
+      operator: Exists
+    - key: node-role.kubernetes.io/control-plane
+      operator: Exists
   affinity:
     nodeAffinity:
       requiredDuringSchedulingIgnoredDuringExecution:
         nodeSelectorTerms:
-        - matchExpressions:
-          - key: node-role.kubernetes.io/control-plane
-            operator: Exists
+          - matchExpressions:
+              - key: node-role.kubernetes.io/control-plane
+                operator: Exists
   volumes:
     - name: planner-config
       configMap:

diff --git a/artefacts/examples/example_deployment.yaml b/artefacts/examples/example_deployment.yaml
@@ -22,6 +22,16 @@ spec:
           env:
             - name: WORKERS
               value: "2"
+          securityContext:
+            capabilities:
+              drop: [ 'ALL' ]
+            seccompProfile:
+              type: RuntimeDefault
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            runAsNonRoot: true
+            runAsUser: 10001
+            runAsGroup: 10001
       restartPolicy: Always
 ---
 apiVersion: v1

diff --git a/defaults.json b/defaults.json
@@ -17,8 +17,8 @@
         "query": "avg(collectd_cpu_percent{exported_instance=~\"%s\"})by(exported_instance)"
       },
       {
-        "name": "llc_value",
-        "query": "avg(rate(collectd_intel_pmu_counter_total{type=\"cache-misses\", exported_instance=~\"%s\"}[30s]))by(exported_instance)"
+        "name": "ipc_value",
+        "query": "avg(rate(collectd_intel_pmu_counter_total{type=\"instructions\",exported_instance=~\"%[1]s\"}[30s]))by(exported_instance)/avg(rate(collectd_intel_pmu_counter_total{type=\"cpu-cycles\",exported_instance=~\"%[1]s\"}[30s]))by(exported_instance)"
       }
     ]
   },

diff --git a/docs/getting_started.md b/docs/getting_started.md
@@ -100,7 +100,7 @@ declared Intents. Over time the planner will learn a scaling model and use that
 For more details on models see the [actuators'](actuators.md) documentation.
 
 This intent declaration with the demo assumes a service mesh is used to measure the KPIs. The KPI profiles used match
-the default queries described earlier.  
+the default queries described earlier.
 
 **_Note_** that for this demonstration, it is assumed that proactive and opportunistic planning are enabled. See the
 configuration references for more details on this.
@@ -171,7 +171,6 @@ Each actuator will have its own configuration.
 | plugin_manager_endpoint  | String defining the plugin manager's endpoint to which actuators can register themselves. |
 | plugin_manager_port      | Port number of the plugin manager's endpoint to which actuators can register themselves.  |
 
-
 ### remove pod actuator
 
 | Property                | Description                                                                               |
@@ -184,6 +183,24 @@ Each actuator will have its own configuration.
 | plugin_manager_endpoint | String defining the plugin manager's endpoint to which actuators can register themselves. |
 | plugin_manager_port     | Port number of the plugin manager's endpoint to which actuators can register themselves.  |
 
+### cpu scale actuator
+
+| Property                     | Description                                                                                                                                                                                                |
+|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| interpreter                  | Path to a python interpreter.                                                                                                                                                                              |
+| analytics_script             | Path to the analytics python script used to determine the scaling model.                                                                                                                                   |
+| cpu_max                      | Maximum CPU resource units (in millis) that the actuator will allow.                                                                                                                                       |
+| cpu_rounding                 | Multiple of 10 defining how to round up CPU resource units.                                                                                                                                                |
+| cpu_safeguard_factor         | Defines the factor the actuator will use to stay below the targeted objective.                                                                                                                             |
+| look_back                    | Time in minutes defining how old the ML model can be.                                                                                                                                                      |
+| max_proactive_cpu            | Maximum CPU resource units (in millis) that the actuator will allow when proactively scaling. If set to 0, proactive planning is disabled. A fraction of this value is used for proactive scale ups/downs. |
+| proactive_latency_percentage | Float defining the potential percentage change in latency by scaling the resources.                                                                                                                        |
+| endpoint                     | Name of the endpoint to use for registering this plugin.                                                                                                                                                   |
+| port                         | Port this actuator should listen on.                                                                                                                                                                       |
+| mongo_endpoint               | URI for the Mongo database - representing the knowledge base of the system.                                                                                                                                |
+| plugin_manager_endpoint      | String defining the plugin manager's endpoint to which actuators can register themselves.                                                                                                                  |
+| plugin_manager_port          | Port number of the plugin manager's endpoint to which actuators can register themselves.                                                                                                                   |
+
 ### RDT actuator
 
 | Property                | Description                                                                               |

diff --git a/docs/pluggability.md b/docs/pluggability.md
@@ -17,7 +17,8 @@ cycle:
 ![plugins, actuator-plugins, plugin-manager](fig/plugin_manager.png)
 
 After successful registration, the planner will call the functions **_NextState_**, **_Perform_** and **_Effect_** via
-[gRPC](https://grpc.io/).
+[gRPC](https://grpc.io/). Bi-directional streaming is implemented for **_NextState_**, which has demonstrated an
+improvement in the planner's performance.
 
 ## Implementing new plugins