Skip to content

Commit 617ee75

Browse files
googs1025 and kerthcet authored
feat: metrics support for llmaz (#316)
* Fix missing metrics endpoint binding
  Signed-off-by: googs1025 <[email protected]>
* Fix metrics-monitor.yaml
* Update the Prometheus config
  Signed-off-by: kerthcet <[email protected]>
* Add missing port in chart/templates/deployment.yaml
* Fix ClusterRoleBinding roleRef name, llmaz -> prometheus
* Add Prometheus operator install docs
* Fix Prometheus being unable to fetch metrics
  Signed-off-by: kerthcet <[email protected]>
* Update
  Signed-off-by: kerthcet <[email protected]>

---------

Signed-off-by: googs1025 <[email protected]>
Signed-off-by: kerthcet <[email protected]>
Co-authored-by: kerthcet <[email protected]>
1 parent a4ddd8d commit 617ee75

File tree

12 files changed

+244
-10
lines changed

12 files changed

+244
-10
lines changed

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,14 @@ envtest: $(ENVTEST) ## Download envtest-setup locally if necessary.
285285
$(ENVTEST): $(LOCALBIN)
286286
test -s $(LOCALBIN)/setup-envtest || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest
287287

288+
.PHONY: install-prometheus
289+
install-prometheus:
290+
kubectl apply --server-side -k config/prometheus
291+
292+
.PHONY: uninstall-prometheus
293+
uninstall-prometheus:
294+
kubectl delete -k config/prometheus
295+
288296
##@Release
289297

290298
.PHONY: artifacts
@@ -300,7 +308,7 @@ HELMIFY ?= $(LOCALBIN)/helmify
300308
.PHONY: helmify
301309
helmify: $(HELMIFY) ## Download helmify locally if necessary.
302310
$(HELMIFY): $(LOCALBIN)
303-
test -s $(LOCALBIN)/helmify || GOBIN=$(LOCALBIN) go install github.com/arttor/helmify/cmd/helmify@v0.4.17
311+
test -s $(LOCALBIN)/helmify || GOBIN=$(LOCALBIN) go install github.com/arttor/helmify/cmd/helmify@v0.4.18
304312

305313
.PHONY: helm
306314
helm: manifests kustomize helmify
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{{- if .Values.prometheus.enable }}
2+
{{- if not (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
3+
{{- fail "The cluster does not support the required API resource `monitoring.coreos.com/v1/ServiceMonitor`." }}
4+
{{- end }}
5+
apiVersion: monitoring.coreos.com/v1
6+
kind: Prometheus
7+
metadata:
8+
name: {{ include "chart.fullname" . }}-prometheus
9+
spec:
10+
serviceAccountName: {{ include "chart.fullname" . }}-prometheus
11+
# Associated ServiceMonitor selector
12+
serviceMonitorSelector:
13+
# Need to match the label in ServiceMonitor
14+
# https://github.com/kubernetes-sigs/jobset/blob/main/config/components/prometheus/monitor.yaml#L7
15+
matchLabels:
16+
control-plane: controller-manager
17+
{{- include "chart.selectorLabels" . | nindent 4 }}
18+
resources:
19+
requests:
20+
memory: 400Mi
21+
enableAdminAPI: false
22+
{{- end }}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{{- if .Values.prometheus.enable }}
2+
{{- if not (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
3+
{{- fail "The cluster does not support the required API resource `monitoring.coreos.com/v1/ServiceMonitor`." }}
4+
{{- end }}
5+
apiVersion: monitoring.coreos.com/v1
6+
kind: ServiceMonitor
7+
metadata:
8+
name: {{ include "chart.fullname" . }}-controller-manager-metrics-monitor
9+
labels:
10+
app.kubernetes.io/component: metrics
11+
app.kubernetes.io/created-by: llmaz
12+
app.kubernetes.io/part-of: llmaz
13+
control-plane: controller-manager
14+
{{- include "chart.selectorLabels" . | nindent 4 }}
15+
spec:
16+
endpoints:
17+
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
18+
path: /metrics
19+
port: https
20+
scheme: https
21+
tlsConfig:
22+
insecureSkipVerify: true
23+
selector:
24+
matchLabels:
25+
control-plane: controller-manager
26+
{{- include "chart.selectorLabels" . | nindent 4 }}
27+
{{- end }}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{{- if .Values.prometheus.enable }}
2+
{{- if not (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }}
3+
{{- fail "The cluster does not support the required API resource `monitoring.coreos.com/v1/ServiceMonitor`." }}
4+
{{- end }}
5+
apiVersion: v1
6+
kind: ServiceAccount
7+
metadata:
8+
name: {{ include "chart.fullname" . }}-prometheus
9+
---
10+
apiVersion: rbac.authorization.k8s.io/v1
11+
kind: ClusterRole
12+
metadata:
13+
name: {{ include "chart.fullname" . }}-prometheus
14+
rules:
15+
- apiGroups: [""]
16+
resources:
17+
- nodes
18+
- nodes/metrics
19+
- services
20+
- endpoints
21+
- pods
22+
verbs: ["get", "list", "watch"]
23+
- apiGroups: [""]
24+
resources:
25+
- configmaps
26+
verbs: ["get"]
27+
- nonResourceURLs: ["/metrics"]
28+
verbs: ["get"]
29+
---
30+
apiVersion: rbac.authorization.k8s.io/v1
31+
kind: ClusterRoleBinding
32+
metadata:
33+
name: {{ include "chart.fullname" . }}-prometheus
34+
roleRef:
35+
apiGroup: rbac.authorization.k8s.io
36+
kind: ClusterRole
37+
name: {{ include "chart.fullname" . }}-prometheus
38+
subjects:
39+
- kind: ServiceAccount
40+
name: {{ include "chart.fullname" . }}-prometheus
41+
namespace: llmaz-system
42+
{{- end }}

chart/values.global.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,7 @@ leaderWorkerSet:
2828
image:
2929
repository: registry.k8s.io/lws/lws
3030
tag: v0.5.0
31+
32+
prometheus:
33+
# -- Whether to enable Prometheus metrics exporting.
34+
enable: false
Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
# This patch exposes 8443 port used by metrics service
21
apiVersion: apps/v1
32
kind: Deployment
43
metadata:
@@ -8,8 +7,8 @@ spec:
87
template:
98
spec:
109
containers:
11-
- name: manager
12-
ports:
13-
- containerPort: 8443
14-
name: metrics
15-
protocol: TCP
10+
- name: manager
11+
ports:
12+
- containerPort: 8443
13+
name: metrics
14+
protocol: TCP
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,7 @@
1+
namespace: llmaz-system
2+
namePrefix: llmaz-
3+
14
resources:
25
- monitor.yaml
6+
- prometheus.yaml
7+
- serviceaccount.yaml

config/prometheus/monitor.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,9 @@ metadata:
55
labels:
66
control-plane: controller-manager
77
app.kubernetes.io/name: servicemonitor
8-
app.kubernetes.io/instance: controller-manager-metrics-monitor
98
app.kubernetes.io/component: metrics
109
app.kubernetes.io/created-by: llmaz
1110
app.kubernetes.io/part-of: llmaz
12-
app.kubernetes.io/managed-by: kustomize
1311
name: controller-manager-metrics-monitor
1412
namespace: system
1513
spec:
@@ -22,4 +20,4 @@ spec:
2220
insecureSkipVerify: true
2321
selector:
2422
matchLabels:
25-
control-plane: controller-manager
23+
app.kubernetes.io/name: service

config/prometheus/prometheus.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: Prometheus
3+
metadata:
4+
name: prometheus
5+
namespace: system
6+
spec:
7+
serviceAccountName: llmaz-prometheus
8+
# Associated ServiceMonitor selector
9+
serviceMonitorSelector:
10+
# Need to match the label in ServiceMonitor
11+
matchLabels:
12+
control-plane: controller-manager
13+
resources:
14+
requests:
15+
memory: 400Mi
16+
enableAdminAPI: false
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
apiVersion: v1
2+
kind: ServiceAccount
3+
metadata:
4+
name: prometheus
5+
---
6+
apiVersion: rbac.authorization.k8s.io/v1
7+
kind: ClusterRole
8+
metadata:
9+
name: prometheus
10+
rules:
11+
- apiGroups: [""]
12+
resources:
13+
- nodes
14+
- nodes/metrics
15+
- services
16+
- endpoints
17+
- pods
18+
verbs: ["get", "list", "watch"]
19+
- apiGroups: [""]
20+
resources:
21+
- configmaps
22+
verbs: ["get"]
23+
- nonResourceURLs: ["/metrics"]
24+
verbs: ["get"]
25+
---
26+
apiVersion: rbac.authorization.k8s.io/v1
27+
kind: ClusterRoleBinding
28+
metadata:
29+
name: prometheus
30+
roleRef:
31+
apiGroup: rbac.authorization.k8s.io
32+
kind: ClusterRole
33+
name: prometheus
34+
subjects:
35+
- kind: ServiceAccount
36+
name: prometheus
37+
namespace: llmaz-system

0 commit comments

Comments
 (0)