From 06037530d24c9ec6670cf157c2baedaf775549a4 Mon Sep 17 00:00:00 2001
From: Francois LP
Date: Wed, 17 Apr 2024 14:07:03 +0200
Subject: [PATCH 1/3] feat: add tail sampling for traces

https://grafana.com/docs/grafana-cloud/monitor-applications/application-observability/setup/sampling/tail/
---
 .../alloy_config/_processors.alloy.txt |    34 +
 examples/tail-sampling/README.md       |    42 +
 examples/tail-sampling/events.alloy    |    34 +
 examples/tail-sampling/logs.alloy      |   135 +
 examples/tail-sampling/metrics.alloy   |   802 +
 examples/tail-sampling/output.yaml     | 51125 ++++++++++++++++
 examples/tail-sampling/profiles.alloy  |     0
 examples/tail-sampling/values.yaml     |    39 +
 8 files changed, 52211 insertions(+)
 create mode 100644 examples/tail-sampling/README.md
 create mode 100644 examples/tail-sampling/events.alloy
 create mode 100644 examples/tail-sampling/logs.alloy
 create mode 100644 examples/tail-sampling/metrics.alloy
 create mode 100644 examples/tail-sampling/output.yaml
 create mode 100644 examples/tail-sampling/profiles.alloy
 create mode 100644 examples/tail-sampling/values.yaml

diff --git a/charts/k8s-monitoring/templates/alloy_config/_processors.alloy.txt b/charts/k8s-monitoring/templates/alloy_config/_processors.alloy.txt
index c2d27f961..f539374f3 100644
--- a/charts/k8s-monitoring/templates/alloy_config/_processors.alloy.txt
+++ b/charts/k8s-monitoring/templates/alloy_config/_processors.alloy.txt
@@ -212,6 +212,40 @@ otelcol.processor.transform "default" {
   }
 }
 
+
+{{- if and .Values.traces.enabled .Values.traces.receiver.tailsampling.policies }}
+otelcol.processor.tail_sampling "default" {
+  {{- range $policy := .Values.traces.receiver.tailsampling.policies }}
+  policy {
+    name = {{ $policy.name | quote }}
+    type = {{ $policy.type | quote }}
+    {{- if and (eq $policy.type "latency") (hasKey $policy "latency") }}
+    latency {
+      threshold_ms = {{ $policy.latency.thresholdMs | int }}
+    }
+    {{- end }}
+
+    {{- if and (eq $policy.type "string_attribute") (hasKey $policy "stringAttribute") }}
+    string_attribute {
+      key = {{ $policy.stringAttribute.key | quote }}
+      values = [
+        {{- range $value := $policy.stringAttribute.values }}
+        {{ $value | quote }},
+        {{- end }}
+      ]
+      enabled_regex_matching = {{ ternary $policy.stringAttribute.enabledRegexMatching true (hasKey $policy.stringAttribute "enabledRegexMatching") }}
+      invert_match = {{ $policy.stringAttribute.invertMatch | default false }}
+    }
+    {{- end }}
+  }
+  {{- end }}
+
+  output {
+    traces = [otelcol.processor.batch.batch_processor.input]
+  }
+}
+{{- end }}
+
 otelcol.processor.filter "default" {
   error_mode = "ignore"
 
diff --git a/examples/tail-sampling/README.md b/examples/tail-sampling/README.md
new file mode 100644
index 000000000..2743621f2
--- /dev/null
+++ b/examples/tail-sampling/README.md
@@ -0,0 +1,42 @@
# Tail sampling

This example shows how to enable [tail sampling](https://grafana.com/docs/grafana-cloud/monitor-applications/application-observability/setup/sampling/tail/) for traces.
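
With tail sampling, the collector buffers the spans of each trace and applies the configured policies to the complete trace, so decisions can use whole-trace properties such as end-to-end duration. With the values in this example, the processor template added by this patch renders approximately the following Alloy block (a sketch of the generated output, not a verbatim copy):

```alloy
otelcol.processor.tail_sampling "default" {
  // Keep only traces whose overall duration exceeds 500 ms.
  policy {
    name = "all_traces_above_500"
    type = "latency"

    latency {
      threshold_ms = 500
    }
  }

  // Sampled traces continue down the regular pipeline.
  output {
    traces = [otelcol.processor.batch.batch_processor.input]
  }
}
```

The values used for this example: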

```yaml
cluster:
  name: tail-sampling-test

externalServices:
  prometheus:
    host: https://prometheus.example.com
    basicAuth:
      username: 12345
      password: "It's a secret to everyone"
  loki:
    host: https://loki.example.com
    basicAuth:
      username: 12345
      password: "It's a secret to everyone"
  tempo:
    host: https://tempo.example.com
    basicAuth:
      username: 12345
      password: "It's a secret to everyone"

metrics:
  enabled: false

logs:
  enabled: false

traces:
  enabled: true
  receiver:
    tailsampling:
      policies:
        - name: all_traces_above_500
          type: latency
          latency:
            thresholdMs: 500
```
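
The template added in this patch also accepts `string_attribute` policies. A hypothetical values snippet (the attribute key and values here are illustrative, not defaults; adjust them to your instrumentation):

```yaml
traces:
  enabled: true
  receiver:
    tailsampling:
      policies:
        - name: server_errors_only
          type: string_attribute
          stringAttribute:
            key: http.status_code
            values:
              - "5.."
            enabledRegexMatching: true
            invertMatch: false
```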
diff --git a/examples/tail-sampling/events.alloy b/examples/tail-sampling/events.alloy
new file mode 100644
index 000000000..3ee5f7d8c
--- /dev/null
+++ b/examples/tail-sampling/events.alloy
@@ -0,0 +1,34 @@
// Cluster Events
loki.source.kubernetes_events "cluster_events" {
  job_name = "integrations/kubernetes/eventhandler"
  log_format = "logfmt"
  forward_to = [loki.process.logs_service.receiver]
}

// Logs Service
remote.kubernetes.secret "logs_service" {
  name = "loki-k8s-monitoring"
  namespace = "default"
}

loki.process "logs_service" {
  stage.static_labels {
    values = {
      cluster = "tail-sampling-test",
    }
  }
  forward_to = [loki.write.logs_service.receiver]
}

// Loki
loki.write "logs_service" {
  endpoint {
    url = nonsensitive(remote.kubernetes.secret.logs_service.data["host"]) + "/loki/api/v1/push"
    tenant_id = nonsensitive(remote.kubernetes.secret.logs_service.data["tenantId"])

    basic_auth {
      username = nonsensitive(remote.kubernetes.secret.logs_service.data["username"])
      password = remote.kubernetes.secret.logs_service.data["password"]
    }
  }
}
diff --git a/examples/tail-sampling/logs.alloy b/examples/tail-sampling/logs.alloy
new file mode 100644
index 000000000..1e4290ad8
--- /dev/null
+++ b/examples/tail-sampling/logs.alloy
@@ -0,0 +1,135 @@
// Pod Logs
discovery.kubernetes "pods" {
  role = "pod"
  selectors {
    role = "pod"
    field = "spec.nodeName=" + env("HOSTNAME")
  }
}

discovery.relabel "pod_logs" {
  targets = discovery.kubernetes.pods.targets
  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    action = "replace"
    target_label = "namespace"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    action = "replace"
    target_label = "pod"
  }
  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    action = "replace"
    target_label = "container"
  }
  rule {
    source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
    separator = "/"
    action = "replace"
    replacement = "$1"
    target_label = "job"
  }
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    separator = "/"
    action = "replace"
    replacement = "/var/log/pods/*$1/*.log"
    target_label = "__path__"
  }

  // Set the container runtime as a label.
  rule {
    action = "replace"
    source_labels = ["__meta_kubernetes_pod_container_id"]
    regex = "^(\\w+):\\/\\/.+$"
    replacement = "$1"
    target_label = "tmp_container_runtime"
  }
}

discovery.relabel "filtered_pod_logs" {
  targets = discovery.relabel.pod_logs.output
  rule { // Drop anything with a "falsy" annotation value
    source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_logs_autogather"]
    regex = "(false|no|skip)"
    action = "drop"
  }
}

local.file_match "pod_logs" {
  path_targets = discovery.relabel.filtered_pod_logs.output
}

loki.source.file "pod_logs" {
  targets = local.file_match.pod_logs.targets
  forward_to = [loki.process.pod_logs.receiver]
}

loki.process "pod_logs" {
  stage.match {
    selector = "{tmp_container_runtime=\"containerd\"}"
    // The cri processing stage extracts the following k/v pairs: log, stream, time, flags.
    stage.cri {}

    // Set the extracted flags and stream values as labels.
    stage.labels {
      values = {
        flags = "",
        stream = "",
      }
    }
  }

  // If the tmp_container_runtime label from above is docker, parse using the docker stage.
  stage.match {
    selector = "{tmp_container_runtime=\"docker\"}"
    // The docker processing stage extracts the following k/v pairs: log, stream, time.
    stage.docker {}

    // Set the extracted stream value as a label.
    stage.labels {
      values = {
        stream = "",
      }
    }
  }

  // Drop the filename label, since it's not really useful in the context of Kubernetes, where we already have
  // cluster, namespace, pod, and container labels.
  // Also drop the temporary container runtime label, as it is no longer needed.
  stage.label_drop {
    values = ["filename", "tmp_container_runtime"]
  }
  forward_to = [loki.process.logs_service.receiver]
}

// Logs Service
remote.kubernetes.secret "logs_service" {
  name = "loki-k8s-monitoring"
  namespace = "default"
}

loki.process "logs_service" {
  stage.static_labels {
    values = {
      cluster = "tail-sampling-test",
    }
  }
  forward_to = [loki.write.logs_service.receiver]
}

// Loki
loki.write "logs_service" {
  endpoint {
    url = nonsensitive(remote.kubernetes.secret.logs_service.data["host"]) + "/loki/api/v1/push"
    tenant_id = nonsensitive(remote.kubernetes.secret.logs_service.data["tenantId"])

    basic_auth {
      username = nonsensitive(remote.kubernetes.secret.logs_service.data["username"])
      password = remote.kubernetes.secret.logs_service.data["password"]
    }
  }
}
diff --git a/examples/tail-sampling/metrics.alloy b/examples/tail-sampling/metrics.alloy
new file mode 100644
index 000000000..432f68f06
--- /dev/null
+++ b/examples/tail-sampling/metrics.alloy
@@ -0,0 +1,802 @@
discovery.kubernetes "nodes" {
  role = "node"
}

discovery.kubernetes "services" {
  role = "service"
}

discovery.kubernetes "endpoints" {
  role = "endpoints"
}

discovery.kubernetes "pods" {
  role = "pod"
}

// OTLP Receivers
otelcol.receiver.otlp "receiver" {
  debug_metrics {
    disable_high_cardinality_metrics = true
  }

  grpc {
    endpoint = "0.0.0.0:4317"
  }

  http {
    endpoint = "0.0.0.0:4318"
  }
  output {
    metrics = [otelcol.processor.resourcedetection.default.input]
    logs = [otelcol.processor.resourcedetection.default.input]
  }
}

// Processors
otelcol.processor.transform "add_metric_datapoint_attributes" {
  // Copy selected resource attributes onto each metric data point.
  error_mode = "ignore"
  metric_statements {
    context = "datapoint"
    statements = [
      "set(attributes[\"deployment.environment\"], resource.attributes[\"deployment.environment\"])",
      "set(attributes[\"service.version\"], resource.attributes[\"service.version\"])",
    ]
  }
  output {
    metrics = [otelcol.processor.k8sattributes.default.input]
  }
}

otelcol.processor.resourcedetection "default" {
  detectors = ["env", "system"]

  system {
    hostname_sources = ["os"]
  }

  output {
    metrics = [otelcol.processor.transform.add_metric_datapoint_attributes.input]
    logs = [otelcol.processor.k8sattributes.default.input]
  }
}

otelcol.processor.k8sattributes "default" {
  extract {
    metadata = [
      "k8s.namespace.name",
"k8s.pod.name", + "k8s.deployment.name", + "k8s.statefulset.name", + "k8s.daemonset.name", + "k8s.cronjob.name", + "k8s.job.name", + "k8s.node.name", + "k8s.pod.uid", + "k8s.pod.start_time", + ] + } + pod_association { + source { + from = "connection" + } + } + + output { + metrics = [otelcol.processor.transform.default.input] + logs = [otelcol.processor.transform.default.input] + } +} + +otelcol.processor.transform "default" { + // Grafana Cloud Kubernetes monitoring expects Loki labels `cluster`, `pod`, and `namespace` + error_mode = "ignore" + metric_statements { + context = "resource" + statements = [ + "set(attributes[\"k8s.cluster.name\"], \"default-values-test\") where attributes[\"k8s.cluster.name\"] == nil", + ] + } + log_statements { + context = "resource" + statements = [ + "set(attributes[\"pod\"], attributes[\"k8s.pod.name\"])", + "set(attributes[\"namespace\"], attributes[\"k8s.namespace.name\"])", + "set(attributes[\"loki.resource.labels\"], \"pod, namespace, cluster, job\")", + "set(attributes[\"k8s.cluster.name\"], \"default-values-test\") where attributes[\"k8s.cluster.name\"] == nil", + ] + } + output { + metrics = [otelcol.processor.filter.default.input] + logs = [otelcol.processor.filter.default.input] + } +} + +otelcol.processor.filter "default" { + error_mode = "ignore" + + output { + metrics = [otelcol.processor.batch.batch_processor.input] + logs = [otelcol.processor.batch.batch_processor.input] + } +} + +otelcol.processor.batch "batch_processor" { + send_batch_size = 16384 + send_batch_max_size = 0 + timeout = "2s" + output { + metrics = [otelcol.exporter.prometheus.metrics_converter.input] + logs = [otelcol.exporter.loki.logs_converter.input] + } +} +otelcol.exporter.prometheus "metrics_converter" { + forward_to = [prometheus.relabel.metrics_service.receiver] +} +otelcol.exporter.loki "logs_converter" { + forward_to = [loki.process.pod_logs.receiver] +} +// Annotation Autodiscovery +discovery.relabel "annotation_autodiscovery_pods" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_job"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_instance"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the pod port + // The discovery generates a target for each declared container port of the pod. + // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation. + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is + // one of the declared ports on that Pod. 
  rule {
    source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portNumber", "__meta_kubernetes_pod_ip"]
    regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})"
    replacement = "[$2]:$1" // IPv6
    target_label = "__address__"
  }
  rule {
    source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portNumber", "__meta_kubernetes_pod_ip"]
    regex = "(\\d+);((([0-9]+?)(\\.|$)){4})" // IPv4, takes priority over IPv6 when both exist
    replacement = "$2:$1"
    target_label = "__address__"
  }

  rule {
    source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_scheme"]
    action = "replace"
    target_label = "__scheme__"
  }
}

discovery.relabel "annotation_autodiscovery_services" {
  targets = discovery.kubernetes.services.targets
  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_scrape"]
    regex = "true"
    action = "keep"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_job"]
    action = "replace"
    target_label = "job"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_instance"]
    action = "replace"
    target_label = "instance"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_path"]
    action = "replace"
    target_label = "__metrics_path__"
  }

  // Choose the service port
  rule {
    source_labels = ["__meta_kubernetes_service_port_name"]
    target_label = "__tmp_port"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_portName"]
    regex = "(.+)"
    target_label = "__tmp_port"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_port_name"]
    action = "keepequal"
    target_label = "__tmp_port"
  }

  rule {
    source_labels = ["__meta_kubernetes_service_port_number"]
    target_label = "__tmp_port"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_portNumber"]
    regex = "(.+)"
    target_label = "__tmp_port"
  }
  rule {
    source_labels = ["__meta_kubernetes_service_port_number"]
    action = "keepequal"
    target_label = "__tmp_port"
  }

  rule {
    source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_scheme"]
    action = "replace"
    target_label = "__scheme__"
  }
}

discovery.relabel "annotation_autodiscovery_http" {
  targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output)
  rule {
    source_labels = ["__scheme__"]
    regex = "https"
    action = "drop"
  }
}

discovery.relabel "annotation_autodiscovery_https" {
  targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output)
  rule {
    source_labels = ["__scheme__"]
    regex = "https"
    action = "keep"
  }
}

prometheus.scrape "annotation_autodiscovery_http" {
  targets = discovery.relabel.annotation_autodiscovery_http.output
  honor_labels = true
  clustering {
    enabled = true
  }
  forward_to = [prometheus.relabel.annotation_autodiscovery.receiver]
}

prometheus.scrape "annotation_autodiscovery_https" {
  targets = discovery.relabel.annotation_autodiscovery_https.output
  honor_labels = true
  bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
  tls_config {
    insecure_skip_verify = true
  }
  clustering {
    enabled = true
  }
  forward_to = [prometheus.relabel.annotation_autodiscovery.receiver]
}
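
// Taken together, the annotation autodiscovery above means that a pod or service annotated with,
// for example (hypothetical values), k8s.grafana.com/scrape: "true" and
// k8s.grafana.com/metrics.portNumber: "8080" is scraped over HTTP or HTTPS as appropriate, and
// its samples flow into the relabel component below.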
"annotation_autodiscovery" { + forward_to = [prometheus.relabel.metrics_service.receiver] +} + +// Grafana Alloy +discovery.relabel "alloy" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_instance"] + regex = "k8smon" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"] + regex = "alloy.*" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + regex = "http-metrics" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } +} + +prometheus.scrape "alloy" { + job_name = "integrations/alloy" + targets = discovery.relabel.alloy.output + scrape_interval = "60s" + forward_to = [prometheus.relabel.alloy.receiver] + clustering { + enabled = true + } +} + +prometheus.relabel "alloy" { + rule { + source_labels = ["__name__"] + regex = "up|alloy_build_info" + action = "keep" + } + forward_to = [prometheus.relabel.metrics_service.receiver] +} + +// Kubernetes Monitoring Telemetry +prometheus.exporter.unix "kubernetes_monitoring_telemetry" { + set_collectors = ["textfile"] + textfile { + directory = "/etc/kubernetes-monitoring-telemetry" + } +} + +prometheus.scrape "kubernetes_monitoring_telemetry" { + job_name = "integrations/kubernetes/kubernetes_monitoring_telemetry" + targets = prometheus.exporter.unix.kubernetes_monitoring_telemetry.targets + scrape_interval = "60s" + clustering { + enabled = true + } + forward_to = [prometheus.relabel.kubernetes_monitoring_telemetry.receiver] +} + +prometheus.relabel "kubernetes_monitoring_telemetry" { + rule { + target_label = "job" + action = "replace" + replacement = "integrations/kubernetes/kubernetes_monitoring_telemetry" + } + rule { + target_label = "instance" + action = "replace" + replacement = "k8smon" + } + rule { + source_labels = ["__name__"] + regex = "up|grafana_kubernetes_monitoring_.*" + action = "keep" + } + forward_to = [prometheus.relabel.metrics_service.receiver] +} + +// Kubelet +discovery.relabel "kubelet" { + targets = discovery.kubernetes.nodes.targets +} + +prometheus.scrape "kubelet" { + job_name = "integrations/kubernetes/kubelet" + targets = discovery.relabel.kubelet.output + scheme = "https" + scrape_interval = "60s" + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + tls_config { + insecure_skip_verify = true + } + clustering { + enabled = true + } + forward_to = [prometheus.relabel.kubelet.receiver] +} + +prometheus.relabel "kubelet" { + rule { + source_labels = ["__name__"] + regex = 
"up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes" + action = "keep" + } + forward_to = [prometheus.relabel.metrics_service.receiver] +} + +// cAdvisor +discovery.relabel "cadvisor" { + targets = discovery.kubernetes.nodes.targets + rule { + replacement = "/metrics/cadvisor" + target_label = "__metrics_path__" + } +} + +prometheus.scrape "cadvisor" { + job_name = "integrations/kubernetes/cadvisor" + targets = discovery.relabel.cadvisor.output + scheme = "https" + scrape_interval = "60s" + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + tls_config { + insecure_skip_verify = true + } + clustering { + enabled = true + } + forward_to = [prometheus.relabel.cadvisor.receiver] +} + +prometheus.relabel "cadvisor" { + rule { + source_labels = ["__name__"] + regex = "up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes" + action = "keep" + } + // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__","container"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*)@" + action = "drop" + } + // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__","image"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@" + action = "drop" + } + // Normalizing unimportant labels (not deleting to continue satisfying