Skip to content

Commit 0210a47

Browse files
aktechpre-commit-ci[bot]viniciusdc
authored
Add Grafana Loki integration (#2156)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Vinicius D. Cerutti <[email protected]>
1 parent 50e2f8a commit 0210a47

File tree

15 files changed

+521
-4
lines changed

15 files changed

+521
-4
lines changed

.cirun.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ runners:
44
- name: run-k8s-tests
55
# Cloud Provider: AWS
66
cloud: aws
7-
# Instance Type has 4 vcpu, 16 GiB memory, Up to 5 Gbps Network Performance
8-
instance_type: t3a.xlarge
7+
# Instance Type has 8 vcpu, 32 GiB memory, Up to 5 Gbps Network Performance
8+
instance_type: t3a.2xlarge
99
# Custom AMI with docker/cypress/hub pre-installed
1010
machine_image: ami-0a388df278199ff52
1111
# Region: Oregon

.github/workflows/test_local_integration.yaml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ jobs:
9696
sed -i -E 's/(cpu_guarantee):\s+[0-9\.]+/\1: 0.25/g' "nebari-config.yaml"
9797
sed -i -E 's/(mem_guarantee):\s+[A-Za-z0-9\.]+/\1: 0.25G/g' "nebari-config.yaml"
9898
99-
10099
# Change default JupyterLab theme
101100
cat >> nebari-config.yaml <<- EOM
102101
jupyterlab:
@@ -105,6 +104,16 @@ jobs:
105104
theme: JupyterLab Dark
106105
EOM
107106
107+
# Change default value for minio persistence size
108+
cat >> nebari-config.yaml <<- EOM
109+
monitoring:
110+
enabled: true
111+
overrides:
112+
minio:
113+
persistence:
114+
size: 1Gi
115+
EOM
116+
108117
cat nebari-config.yaml
109118
110119
- name: Deploy Nebari
@@ -115,7 +124,7 @@ jobs:
115124
- name: Basic kubectl checks after deployment
116125
if: always()
117126
run: |
118-
kubectl get all,cm,secret,ing -A
127+
kubectl get all,cm,secret,pv,pvc,ing -A
119128
120129
- name: Check github-actions.nebari.dev resolves
121130
run: |

RELEASE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ This file is copied to nebari-dev/nebari-docs using a GitHub Action. -->
1111

1212
## Upcoming Release
1313

14+
* Added Grafana Loki to aggregate, index and search logs
15+
1416
## Release 2024.1.1 - January 17, 2024
1517

1618
### Feature changes and enhancements

src/_nebari/stages/kubernetes_services/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,16 @@ class JHubApps(schema.Base):
199199
enabled: bool = False
200200

201201

202+
class MonitoringOverrides(schema.Base):
203+
loki: typing.Dict = {}
204+
promtail: typing.Dict = {}
205+
minio: typing.Dict = {}
206+
207+
202208
class Monitoring(schema.Base):
203209
enabled: bool = True
210+
overrides: MonitoringOverrides = MonitoringOverrides()
211+
minio_enabled: bool = True
204212

205213

206214
class JupyterLabPioneer(schema.Base):
@@ -381,6 +389,12 @@ class DaskGatewayInputVars(schema.Base):
381389

382390
class MonitoringInputVars(schema.Base):
383391
monitoring_enabled: bool = Field(alias="monitoring-enabled")
392+
minio_enabled: bool = Field(alias="minio-enabled")
393+
grafana_loki_overrides: List[str] = Field(alias="grafana-loki-overrides")
394+
grafana_promtail_overrides: List[str] = Field(alias="grafana-promtail-overrides")
395+
grafana_loki_minio_overrides: List[str] = Field(
396+
alias="grafana-loki-minio-overrides"
397+
)
384398

385399

386400
class TelemetryInputVars(schema.Base):
@@ -524,6 +538,14 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]):
524538

525539
monitoring_vars = MonitoringInputVars(
526540
monitoring_enabled=self.config.monitoring.enabled,
541+
minio_enabled=self.config.monitoring.minio_enabled,
542+
grafana_loki_overrides=[json.dumps(self.config.monitoring.overrides.loki)],
543+
grafana_promtail_overrides=[
544+
json.dumps(self.config.monitoring.overrides.promtail)
545+
],
546+
grafana_loki_minio_overrides=[
547+
json.dumps(self.config.monitoring.overrides.minio)
548+
],
527549
)
528550

529551
telemetry_vars = TelemetryInputVars(
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
resource "random_password" "minio_root_password" {
2+
length = 32
3+
special = false
4+
}
5+
6+
locals {
7+
minio-url = "http://${var.minio-release-name}:${var.minio-port}"
8+
node-selector = {
9+
"${var.node-group.key}" = "${var.node-group.value}"
10+
}
11+
}
12+
13+
resource "helm_release" "loki-minio" {
14+
count = var.minio-enabled ? 1 : 0
15+
name = var.minio-release-name
16+
namespace = var.namespace
17+
repository = "https://raw.githubusercontent.com/bitnami/charts/defb094c658024e4aa8245622dab202874880cbc/bitnami"
18+
chart = "minio"
19+
# last release that was Apache-2.0
20+
version = var.minio-helm-chart-version
21+
22+
set {
23+
name = "accessKey.password"
24+
value = "admin"
25+
}
26+
27+
set {
28+
name = "secretKey.password"
29+
value = random_password.minio_root_password.result
30+
}
31+
32+
set {
33+
name = "defaultBuckets"
34+
value = join(" ", var.buckets)
35+
}
36+
37+
set {
38+
name = "persistence.size"
39+
value = var.minio-storage
40+
}
41+
42+
values = concat([
43+
file("${path.module}/values_minio.yaml"),
44+
jsonencode({
45+
nodeSelector : local.node-selector
46+
})
47+
], var.grafana-loki-minio-overrides)
48+
}
49+
50+
51+
resource "helm_release" "grafana-loki" {
52+
name = "nebari-loki"
53+
namespace = var.namespace
54+
repository = "https://grafana.github.io/helm-charts"
55+
chart = "loki"
56+
version = var.loki-helm-chart-version
57+
58+
values = concat([
59+
file("${path.module}/values_loki.yaml"),
60+
jsonencode({
61+
loki : {
62+
storage : {
63+
s3 : {
64+
endpoint : local.minio-url,
65+
accessKeyId : "admin"
66+
secretAccessKey : random_password.minio_root_password.result,
67+
s3ForcePathStyle : true
68+
}
69+
}
70+
}
71+
storageConfig : {
72+
# We configure MinIO by using the AWS config because MinIO implements the S3 API
73+
aws : {
74+
s3 : local.minio-url
75+
s3ForcePathStyle : true
76+
}
77+
}
78+
write : { nodeSelector : local.node-selector }
79+
read : { nodeSelector : local.node-selector }
80+
backend : { nodeSelector : local.node-selector }
81+
gateway : { nodeSelector : local.node-selector }
82+
})
83+
], var.grafana-loki-overrides)
84+
85+
depends_on = [helm_release.loki-minio]
86+
}
87+
88+
resource "helm_release" "grafana-promtail" {
89+
# Promtail ships the contents of logs to the Loki instance
90+
name = "nebari-promtail"
91+
namespace = var.namespace
92+
repository = "https://grafana.github.io/helm-charts"
93+
chart = "promtail"
94+
version = var.promtail-helm-chart-version
95+
96+
values = concat([
97+
file("${path.module}/values_promtail.yaml"),
98+
jsonencode({
99+
})
100+
], var.grafana-promtail-overrides)
101+
102+
depends_on = [helm_release.grafana-loki]
103+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# https://github.com/grafana/loki/blob/4cae003ecedd474e4c15feab4ea2ef435afff83f/production/helm/loki/values.yaml
2+
3+
loki:
4+
storage:
5+
type: s3
6+
commonConfig:
7+
replication_factor: 1
8+
# Not required as it is inside cluster and not exposed to the public network
9+
auth_enabled: false
10+
11+
# The Compactor deduplicates index entries and also applies granular retention.
12+
compactor:
13+
# Directory where marked chunks and temporary tables will be saved.
14+
working_directory: /var/loki/compactor/data/retention
15+
# minio s3
16+
shared_store: s3
17+
# how often compaction will happen
18+
compaction_interval: 1h
19+
# Delete old logs once the retention delete delay has elapsed;
20+
# ideally we would want to do storage based retention, but this is not
21+
# currently implemented in loki, that's why we're doing time based retention.
22+
retention_enabled: true
23+
# Delay after which the Compactor will delete marked chunks.
24+
retention_delete_delay: 1h
25+
# Maximum number of goroutine workers instantiated to delete chunks.
26+
retention_delete_worker_count: 150
27+
28+
limits_config:
29+
# The minimum retention period is 24h.
30+
# This is reasonable in most cases, but if people would like to retain logs for longer
31+
# then they can override this variable from nebari-config.yaml
32+
retention_period: 60d
33+
34+
schema_config:
35+
configs:
36+
# list of period_configs
37+
# The date of the first day that index buckets should be created.
38+
- from: "2024-03-01"
39+
index:
40+
period: 24h
41+
prefix: loki_index_
42+
object_store: s3
43+
schema: v11
44+
store: boltdb-shipper
45+
storage_config:
46+
boltdb_shipper:
47+
# Directory where ingesters would write index files which would then be
48+
# uploaded by shipper to configured storage
49+
active_index_directory: /var/loki/compactor/data/index
50+
# Cache location for restoring index files from storage for queries
51+
cache_location: /var/loki/compactor/data/boltdb-cache
52+
# Shared store for keeping index files
53+
shared_store: s3
54+
55+
# Configuration for the write pod(s)
56+
write:
57+
# -- Number of replicas for the write
58+
# Keeping cost of running Nebari in mind
59+
# we don't need many replicas; if people need more,
60+
# they can always override from nebari-config.yaml
61+
replicas: 1
62+
63+
read:
64+
# -- Number of replicas for the read
65+
replicas: 1
66+
67+
backend:
68+
# -- Number of replicas for the backend
69+
replicas: 1
70+
71+
minio:
72+
# We are deploying minio from bitnami chart separately
73+
enabled: false
74+
75+
monitoring:
76+
selfMonitoring:
77+
grafanaAgent:
78+
installOperator: false
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# https://github.com/bitnami/charts/blob/440ec159c26e4ff0748b9e9866b345d98220c40a/bitnami/minio/values.yaml
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# https://github.com/grafana/helm-charts/blob/3831194ba2abd2a0ca7a14ca00e578f8e9d2abc6/charts/promtail/values.yaml
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
variable "namespace" {
2+
description = "deploy monitoring services on this namespace"
3+
type = string
4+
default = "dev"
5+
}
6+
7+
variable "loki-helm-chart-version" {
8+
description = "version to deploy for the loki helm chart"
9+
type = string
10+
default = "5.43.3"
11+
}
12+
13+
variable "promtail-helm-chart-version" {
14+
description = "version to deploy for the promtail helm chart"
15+
type = string
16+
default = "6.15.5"
17+
}
18+
19+
variable "minio-helm-chart-version" {
20+
description = "version to deploy for the minio helm chart"
21+
type = string
22+
default = "6.7.4"
23+
}
24+
25+
variable "grafana-loki-overrides" {
26+
description = "Grafana Loki helm chart overrides"
27+
type = list(string)
28+
default = []
29+
}
30+
31+
variable "grafana-promtail-overrides" {
32+
description = "Grafana Promtail helm chart overrides"
33+
type = list(string)
34+
default = []
35+
}
36+
37+
variable "grafana-loki-minio-overrides" {
38+
description = "Grafana Loki minio helm chart overrides"
39+
type = list(string)
40+
default = []
41+
}
42+
43+
variable "minio-release-name" {
44+
description = "Grafana Loki minio release name"
45+
type = string
46+
default = "nebari-loki-minio"
47+
}
48+
49+
variable "minio-port" {
50+
description = "Grafana Loki minio port"
51+
type = number
52+
default = 9000
53+
}
54+
55+
variable "buckets" {
56+
description = "Minio buckets"
57+
type = list(string)
58+
default = [
59+
"chunks",
60+
"ruler",
61+
"admin",
62+
"loki"
63+
]
64+
}
65+
66+
variable "minio-storage" {
67+
description = "Minio storage"
68+
type = string
69+
default = "50Gi"
70+
}
71+
72+
variable "minio-enabled" {
73+
description = "Deploy minio along with loki or not"
74+
type = bool
75+
default = true
76+
}
77+
78+
variable "node-group" {
79+
description = "Node key value pair for bound resources"
80+
type = object({
81+
key = string
82+
value = string
83+
})
84+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
11
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
2+
3+
grafana:
4+
additionalDataSources:
5+
- name: Loki
6+
type: loki
7+
url: http://loki-gateway.dev

0 commit comments

Comments
 (0)