diff --git a/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet b/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet index ab6f29434e..e1f628a7e4 100644 --- a/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet +++ b/jsonnet/kube-prometheus/addons/insecure-kubelet.libsonnet @@ -1,40 +1,14 @@ { prometheus+: { - serviceMonitorKubelet+: - { - spec+: { - endpoints: [ - { - port: 'http-metrics', - scheme: 'http', - interval: '30s', - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { sourceLabels: ['__metrics_path__'], targetLabel: 'metrics_path' }, - ], - }, - { - port: 'http-metrics', - scheme: 'http', - path: '/metrics/cadvisor', - interval: '30s', - honorLabels: true, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { sourceLabels: ['__metrics_path__'], targetLabel: 'metrics_path' }, - ], - metricRelabelings: [ - // Drop a bunch of metrics which are disabled but still sent, see - // https://github.com/google/cadvisor/issues/1925. 
- { - sourceLabels: ['__name__'], - regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', - action: 'drop', - }, - ], - }, - ], - }, + scrapeConfigKubelet+: { + spec+: { + scheme: 'HTTP', }, + }, + scrapeConfigKubeletCadvisor+: { + spec+: { + scheme: 'HTTP', + }, + }, }, } diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet index a771e95dbe..fda07be6d4 100644 --- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -26,6 +26,7 @@ local defaults = { }, }, kubeProxy:: false, + prometheusServiceAccountTokenSecretName: 'prometheus-k8s-token', }; function(params) { @@ -87,102 +88,114 @@ function(params) { }, }, - serviceMonitorKubelet: { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', + scrapeConfigKubelet: { + apiVersion: 'monitoring.coreos.com/v1alpha1', + kind: 'ScrapeConfig', metadata: k8s._metadata { name: 'kubelet', labels+: { 'app.kubernetes.io/name': 'kubelet' }, }, spec: { - jobLabel: 'app.kubernetes.io/name', - endpoints: [ + authorization: { + credentials: { + key: 'token', + name: k8s._config.prometheusServiceAccountTokenSecretName, + }, + type: 'Bearer', + }, + honorLabels: true, + kubernetesSDConfigs: [{ role: 'Node' }], + metricRelabelings: relabelings, + metricsPath: '/metrics', + // Majority of those relabelings are here to preserve as much backwards compatibility as possible + // with the old ServiceMonitor scrape configuration.
+ relabelings: [ { - port: 'https-metrics', - scheme: 'https', - interval: '30s', - honorLabels: true, - tlsConfig: { insecureSkipVerify: true }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - metricRelabelings: relabelings, - relabelings: [{ - action: 'replace', - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }], + action: 'replace', + sourceLabels: ['__metrics_path__'], + targetLabel: 'metrics_path', }, { - port: 'https-metrics', - scheme: 'https', - path: '/metrics/cadvisor', - interval: '30s', - honorLabels: true, - honorTimestamps: false, - tlsConfig: { - insecureSkipVerify: true, - }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [{ - action: 'replace', - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }], - metricRelabelings: [ - // Drop a bunch of metrics which are disabled but still sent, see - // https://github.com/google/cadvisor/issues/1925. - { - sourceLabels: ['__name__'], - regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', - action: 'drop', - }, - // Drop cAdvisor metrics with no (pod, namespace) labels while preserving ability to monitor system services resource usage (cardinality estimation) - { - sourceLabels: ['__name__', 'pod', 'namespace'], - action: 'drop', - regex: '(' + std.join('|', - [ - 'container_spec_.*', // everything related to cgroup specification and thus static data (nodes*services*5) - 'container_file_descriptors', // file descriptors limits and global numbers are exposed via (nodes*services) - 'container_sockets', // used sockets in cgroup. Usually not important for system services (nodes*services) - 'container_threads_max', // max number of threads in cgroup. Usually for system services it is not limited (nodes*services) - 'container_threads', // used threads in cgroup. 
Usually not important for system services (nodes*services) - 'container_start_time_seconds', // container start. Possibly not needed for system services (nodes*services) - 'container_last_seen', // not needed as system services are always running (nodes*services) - ]) + ');;', - }, - { - sourceLabels: ['__name__', 'container'], - action: 'drop', - regex: '(' + std.join('|', - [ - 'container_blkio_device_usage_total', - ]) + ');.+', - }, - ], + action: 'replace', + replacement: 'kube-system', + targetLabel: 'namespace', }, { - port: 'https-metrics', - scheme: 'https', - path: '/metrics/probes', - interval: '30s', - honorLabels: true, - tlsConfig: { insecureSkipVerify: true }, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [{ - action: 'replace', - sourceLabels: ['__metrics_path__'], - targetLabel: 'metrics_path', - }], + action: 'replace', + sourceLabels: ['__meta_kubernetes_node_name'], + targetLabel: 'node', + }, + { + targetLabel: 'job', + replacement: 'kubelet', }, ], - selector: { - matchLabels: { 'app.kubernetes.io/name': 'kubelet' }, - }, - namespaceSelector: { - matchNames: ['kube-system'], + scheme: 'HTTPS', + scrapeInterval: '30s', + tlsConfig: { + insecureSkipVerify: true, }, }, }, + scrapeConfigKubeletProbes: k8s.scrapeConfigKubelet { + metadata+: { + name: 'kubelet-probes', + }, + spec+: { + metricsPath: '/metrics/probes', + }, + }, + scrapeConfigKubeletCadvisor: k8s.scrapeConfigKubelet { + metadata+: { + name: 'kubelet-cadvisor', + }, + spec+: { + honorTimestamps: false, + metricsPath: '/metrics/cadvisor', + metricRelabelings: [ + // Drop a bunch of metrics which are disabled but still sent, see + // https://github.com/google/cadvisor/issues/1925.
+ { + sourceLabels: ['__name__'], + regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', + action: 'drop', + }, + // Drop cAdvisor metrics with no (pod, namespace) labels while preserving ability to monitor system services resource usage (cardinality estimation) + { + sourceLabels: ['__name__', 'pod', 'namespace'], + action: 'drop', + regex: '(' + std.join('|', + [ + 'container_spec_.*', // everything related to cgroup specification and thus static data (nodes*services*5) + 'container_file_descriptors', // file descriptors limits and global numbers are exposed via (nodes*services) + 'container_sockets', // used sockets in cgroup. Usually not important for system services (nodes*services) + 'container_threads_max', // max number of threads in cgroup. Usually for system services it is not limited (nodes*services) + 'container_threads', // used threads in cgroup. Usually not important for system services (nodes*services) + 'container_start_time_seconds', // container start. 
Possibly not needed for system services (nodes*services) + 'container_last_seen', // not needed as system services are always running (nodes*services) + ]) + ');;', + }, + { + sourceLabels: ['__name__', 'container'], + action: 'drop', + regex: '(' + std.join('|', + [ + 'container_blkio_device_usage_total', + ]) + ');.+', + }, + ], + }, + }, + /*scrapeConfigKubeletSLIs: k8s.scrapeConfigKubelet { + metadata+: { + name: 'kubelet-slis', + }, + spec+: { + metricsPath: '/metrics/slis', + scrapeInterval: '5s', + scrapeTimeout: '5s', + }, + },*/ serviceMonitorKubeControllerManager: { apiVersion: 'monitoring.coreos.com/v1', diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet index 3405c8f3e3..3b9eb5c62e 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -112,7 +112,7 @@ local utils = import './lib/utils.libsonnet'; image: $.values.common.images.prometheusAdapter, prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' 
+ $.values.prometheus.namespace + '.svc:9090/', rangeIntervals+: { - kubelet: utils.rangeInterval($.kubernetesControlPlane.serviceMonitorKubelet.spec.endpoints[0].interval), + kubelet: utils.rangeInterval($.kubernetesControlPlane.scrapeConfigKubelet.spec.scrapeInterval), nodeExporter: utils.rangeInterval($.nodeExporter.serviceMonitor.spec.endpoints[0].interval), }, }, @@ -127,6 +127,7 @@ local utils = import './lib/utils.libsonnet'; kubernetesControlPlane: { namespace: $.values.common.namespace, mixin+: { ruleLabels: $.values.common.ruleLabels }, + prometheusServiceAccountTokenSecretName: 'prometheus-' + $.values.prometheus.name + '-token', }, }, diff --git a/kustomization.yaml b/kustomization.yaml index e0d8039ed7..cf790be4d8 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -37,11 +37,13 @@ resources: - ./manifests/kubeStateMetrics-serviceAccount.yaml - ./manifests/kubeStateMetrics-serviceMonitor.yaml - ./manifests/kubernetesControlPlane-prometheusRule.yaml +- ./manifests/kubernetesControlPlane-scrapeConfigKubelet.yaml +- ./manifests/kubernetesControlPlane-scrapeConfigKubeletCadvisor.yaml +- ./manifests/kubernetesControlPlane-scrapeConfigKubeletProbes.yaml - ./manifests/kubernetesControlPlane-serviceMonitorApiserver.yaml - ./manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml - ./manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml - ./manifests/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml - ./manifests/nodeExporter-clusterRole.yaml - ./manifests/nodeExporter-clusterRoleBinding.yaml - ./manifests/nodeExporter-daemonset.yaml diff --git a/manifests/kubernetesControlPlane-scrapeConfigKubelet.yaml b/manifests/kubernetesControlPlane-scrapeConfigKubelet.yaml new file mode 100644 index 0000000000..9e62d9fd16 --- /dev/null +++ b/manifests/kubernetesControlPlane-scrapeConfigKubelet.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: 
ScrapeConfig +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: kube-prometheus + name: kubelet + namespace: monitoring +spec: + authorization: + credentials: + key: token + name: prometheus-k8s-token + type: Bearer + honorLabels: true + kubernetesSDConfigs: + - role: Node + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: 
etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autor
egistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_c
lient_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + metricsPath: /metrics + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + replacement: kube-system + targetLabel: namespace + - action: replace + sourceLabels: + - __meta_kubernetes_node_name + targetLabel: node + - replacement: kubelet + targetLabel: job + scheme: HTTPS + scrapeInterval: 30s + tlsConfig: + insecureSkipVerify: true diff --git a/manifests/kubernetesControlPlane-scrapeConfigKubeletCadvisor.yaml b/manifests/kubernetesControlPlane-scrapeConfigKubeletCadvisor.yaml new file mode 100644 index 0000000000..f01a583bc1 --- /dev/null +++ b/manifests/kubernetesControlPlane-scrapeConfigKubeletCadvisor.yaml @@ -0,0 +1,70 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ScrapeConfig +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: kube-prometheus + name: kubelet-cadvisor + namespace: monitoring +spec: + authorization: + credentials: + key: token + name: prometheus-k8s-token + type: Bearer + honorLabels: true + honorTimestamps: false + kubernetesSDConfigs: + - role: Node + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: 
scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: 
(admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_l
atency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + metricsPath: /metrics/cadvisor + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + replacement: kube-system + targetLabel: namespace + - action: replace + sourceLabels: + - 
__meta_kubernetes_node_name + targetLabel: node + - replacement: kubelet + targetLabel: job + scheme: HTTPS + scrapeInterval: 30s + tlsConfig: + insecureSkipVerify: true diff --git a/manifests/kubernetesControlPlane-scrapeConfigKubeletProbes.yaml b/manifests/kubernetesControlPlane-scrapeConfigKubeletProbes.yaml new file mode 100644 index 0000000000..8a20a09824 --- /dev/null +++ b/manifests/kubernetesControlPlane-scrapeConfigKubeletProbes.yaml @@ -0,0 +1,52 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ScrapeConfig +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: kube-prometheus + name: kubelet-probes + namespace: monitoring +spec: + authorization: + credentials: + key: token + name: prometheus-k8s-token + type: Bearer + honorLabels: true + kubernetesSDConfigs: + - role: Node + metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + - action: drop + regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);; + sourceLabels: + - __name__ + - pod + - namespace + - action: drop + regex: (container_blkio_device_usage_total);.+ + sourceLabels: + - __name__ + - container + metricsPath: /metrics/probes + relabelings: + - action: replace + sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + replacement: kube-system + targetLabel: namespace + - action: replace + sourceLabels: + - __meta_kubernetes_node_name + targetLabel: node + - replacement: kubelet + targetLabel: job + scheme: HTTPS + scrapeInterval: 30s + tlsConfig: + insecureSkipVerify: true diff --git a/manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml b/manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml deleted file mode 100644 index 96bbdbab72..0000000000 --- 
a/manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml +++ /dev/null @@ -1,105 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - app.kubernetes.io/name: kubelet - app.kubernetes.io/part-of: kube-prometheus - name: kubelet - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers|storage_db_total_size_in_bytes) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - 
action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_cont
roller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controlle
r_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) - sourceLabels: - - __name__ - port: https-metrics - relabelings: - - action: replace - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - honorTimestamps: false - interval: 30s - metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ - - action: drop - regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);; - sourceLabels: - - __name__ - - pod - - namespace - - action: drop - regex: (container_blkio_device_usage_total);.+ - sourceLabels: - - __name__ - - container - path: /metrics/cadvisor - port: https-metrics - relabelings: - - action: replace - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - path: /metrics/probes - port: https-metrics - relabelings: - - action: replace - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - jobLabel: app.kubernetes.io/name - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - app.kubernetes.io/name: kubelet