diff --git a/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf b/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf index b62d490a3..507002e69 100644 --- a/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf +++ b/modules/smart-agent_elasticsearch/detectors-elasticsearch.tf @@ -35,8 +35,8 @@ resource "signalfx_detector" "cluster_status" { program_text = <<-EOF signal = data('elasticsearch.cluster.status', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('signal') - detect(when(signal == 1)).publish('MAJOR') - detect(when(signal == 2)).publish('CRIT') + detect(when(signal == 1, lasting=%{if var.cluster_status_lasting == null}None%{else}'${var.cluster_status_lasting}'%{endif}, at_least=${var.cluster_status_at_least})).publish('MAJOR') + detect(when(signal == 2, lasting=%{if var.cluster_status_lasting == null}None%{else}'${var.cluster_status_lasting}'%{endif}, at_least=${var.cluster_status_at_least})).publish('CRIT') EOF rule { @@ -75,8 +75,8 @@ resource "signalfx_detector" "cluster_initializing_shards" { program_text = <<-EOF signal = data('elasticsearch.cluster.initializing-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_initializing_shards_aggregation_function}${var.cluster_initializing_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_initializing_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_initializing_shards_threshold_major}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting == null}None%{else}'${var.cluster_initializing_shards_lasting}'%{endif}, 
at_least=${var.cluster_initializing_shards_at_least})).publish('CRIT') + detect(when(signal > ${var.cluster_initializing_shards_threshold_major}, lasting=%{if var.cluster_initializing_shards_lasting == null}None%{else}'${var.cluster_initializing_shards_lasting}'%{endif}, at_least=${var.cluster_initializing_shards_at_least}) and (not when(signal > ${var.cluster_initializing_shards_threshold_critical}, lasting=%{if var.cluster_initializing_shards_lasting == null}None%{else}'${var.cluster_initializing_shards_lasting}'%{endif}, at_least=${var.cluster_initializing_shards_at_least}))).publish('MAJOR') EOF rule { @@ -115,8 +115,8 @@ resource "signalfx_detector" "cluster_relocating_shards" { program_text = <<-EOF signal = data('elasticsearch.cluster.relocating-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_relocating_shards_aggregation_function}${var.cluster_relocating_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_relocating_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_relocating_shards_threshold_major}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting == null}None%{else}'${var.cluster_relocating_shards_lasting}'%{endif}, at_least=${var.cluster_relocating_shards_at_least})).publish('CRIT') + detect(when(signal > ${var.cluster_relocating_shards_threshold_major}, lasting=%{if var.cluster_relocating_shards_lasting == null}None%{else}'${var.cluster_relocating_shards_lasting}'%{endif}, at_least=${var.cluster_relocating_shards_at_least}) and (not when(signal > ${var.cluster_relocating_shards_threshold_critical}, lasting=%{if var.cluster_relocating_shards_lasting == null}None%{else}'${var.cluster_relocating_shards_lasting}'%{endif}, 
at_least=${var.cluster_relocating_shards_at_least}))).publish('MAJOR') EOF rule { @@ -155,8 +155,8 @@ resource "signalfx_detector" "cluster_unassigned_shards" { program_text = <<-EOF signal = data('elasticsearch.cluster.unassigned-shards', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.cluster_unassigned_shards_aggregation_function}${var.cluster_unassigned_shards_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting == null}None%{else}'${var.cluster_unassigned_shards_lasting}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least})).publish('CRIT') + detect(when(signal > ${var.cluster_unassigned_shards_threshold_major}, lasting=%{if var.cluster_unassigned_shards_lasting == null}None%{else}'${var.cluster_unassigned_shards_lasting}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least}) and (not when(signal > ${var.cluster_unassigned_shards_threshold_critical}, lasting=%{if var.cluster_unassigned_shards_lasting == null}None%{else}'${var.cluster_unassigned_shards_lasting}'%{endif}, at_least=${var.cluster_unassigned_shards_at_least}))).publish('MAJOR') EOF rule { @@ -195,8 +195,8 @@ resource "signalfx_detector" "pending_tasks" { program_text = <<-EOF signal = data('elasticsearch.cluster.pending-tasks', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average')${var.pending_tasks_aggregation_function}${var.pending_tasks_transformation_function}.publish('signal') - detect(when(signal > ${var.pending_tasks_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.pending_tasks_threshold_major}) 
and (not when(signal > ${var.pending_tasks_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_threshold_lasting == null}None%{else}'${var.pending_tasks_threshold_lasting}'%{endif}, at_least=${var.pending_tasks_threshold_at_least})).publish('CRIT') + detect(when(signal > ${var.pending_tasks_threshold_major}, lasting=%{if var.pending_tasks_threshold_lasting == null}None%{else}'${var.pending_tasks_threshold_lasting}'%{endif}, at_least=${var.pending_tasks_threshold_at_least}) and (not when(signal > ${var.pending_tasks_threshold_critical}, lasting=%{if var.pending_tasks_threshold_lasting == null}None%{else}'${var.pending_tasks_threshold_lasting}'%{endif}, at_least=${var.pending_tasks_threshold_at_least}))).publish('MAJOR') EOF rule { @@ -235,8 +235,8 @@ resource "signalfx_detector" "cpu_usage" { program_text = <<-EOF signal = data('elasticsearch.process.cpu.percent', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.cpu_usage_aggregation_function}${var.cpu_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.cpu_usage_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cpu_usage_threshold_major}) and (not when(signal > ${var.cpu_usage_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting == null}None%{else}'${var.cpu_usage_lasting}'%{endif}, at_least=${var.cpu_usage_at_least})).publish('CRIT') + detect(when(signal > ${var.cpu_usage_threshold_major}, lasting=%{if var.cpu_usage_lasting == null}None%{else}'${var.cpu_usage_lasting}'%{endif}, at_least=${var.cpu_usage_at_least}) and (not when(signal > ${var.cpu_usage_threshold_critical}, lasting=%{if var.cpu_usage_lasting == null}None%{else}'${var.cpu_usage_lasting}'%{endif}, at_least=${var.cpu_usage_at_least}))).publish('MAJOR') EOF rule { @@ 
-274,11 +274,11 @@ resource "signalfx_detector" "file_descriptors" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.process.open_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} - B = data('elasticsearch.process.max_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} + A = data('elasticsearch.process.open_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} + B = data('elasticsearch.process.max_file_descriptors', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', rollup='average')${var.file_descriptors_aggregation_function}${var.file_descriptors_transformation_function} signal = (A/B).scale(100).publish('signal') - detect(when(signal > ${var.file_descriptors_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.file_descriptors_threshold_major}) and (not when(signal > ${var.file_descriptors_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting == null}None%{else}'${var.file_descriptors_lasting}'%{endif}, at_least=${var.file_descriptors_at_least})).publish('CRIT') + detect(when(signal > ${var.file_descriptors_threshold_major}, lasting=%{if var.file_descriptors_lasting == null}None%{else}'${var.file_descriptors_lasting}'%{endif}, 
at_least=${var.file_descriptors_at_least}) and (not when(signal > ${var.file_descriptors_threshold_critical}, lasting=%{if var.file_descriptors_lasting == null}None%{else}'${var.file_descriptors_lasting}'%{endif}, at_least=${var.file_descriptors_at_least}))).publish('MAJOR') EOF rule { @@ -317,8 +317,8 @@ resource "signalfx_detector" "jvm_heap_memory_usage" { program_text = <<-EOF signal = data('elasticsearch.jvm.mem.heap-used-percent', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_heap_memory_usage_aggregation_function}${var.jvm_heap_memory_usage_transformation_function}.publish('signal') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}))).publish('MAJOR') + detect(when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting == null}None%{else}'${var.jvm_heap_memory_usage_lasting}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least})).publish('CRIT') + detect(when(signal > ${var.jvm_heap_memory_usage_threshold_major}, lasting=%{if var.jvm_heap_memory_usage_lasting == null}None%{else}'${var.jvm_heap_memory_usage_lasting}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least}) and (not when(signal > ${var.jvm_heap_memory_usage_threshold_critical}, lasting=%{if var.jvm_heap_memory_usage_lasting == null}None%{else}'${var.jvm_heap_memory_usage_lasting}'%{endif}, at_least=${var.jvm_heap_memory_usage_at_least}))).publish('MAJOR') EOF rule { @@ -356,11 +356,11 @@ resource "signalfx_detector" "jvm_memory_young_usage" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and 
${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} - B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} - signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}) and (not when(signal > ${var.jvm_memory_young_usage_threshold_major}))).publish('MINOR') + A = data('elasticsearch.jvm.mem.pools.young.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} + B = data('elasticsearch.jvm.mem.pools.young.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', rollup='average')${var.jvm_memory_young_usage_aggregation_function}${var.jvm_memory_young_usage_transformation_function} + signal = (A/B).scale(100).publish('signal') + detect(when(signal > ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if var.jvm_memory_young_usage_lasting == null}None%{else}'${var.jvm_memory_young_usage_lasting}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least})).publish('MAJOR') + detect(when(signal > ${var.jvm_memory_young_usage_threshold_minor}, lasting=%{if var.jvm_memory_young_usage_lasting == null}None%{else}'${var.jvm_memory_young_usage_lasting}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least}) and (not when(signal > ${var.jvm_memory_young_usage_threshold_major}, lasting=%{if 
var.jvm_memory_young_usage_lasting == null}None%{else}'${var.jvm_memory_young_usage_lasting}'%{endif}, at_least=${var.jvm_memory_young_usage_at_least}))).publish('MINOR') EOF rule { @@ -398,11 +398,11 @@ resource "signalfx_detector" "jvm_memory_old_usage" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} - B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} - signal = (A/B).fill(0).scale(100).publish('signal') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}) and (not when(signal > ${var.jvm_memory_old_usage_threshold_major}))).publish('MINOR') + A = data('elasticsearch.jvm.mem.pools.old.used_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} + B = data('elasticsearch.jvm.mem.pools.old.max_in_bytes', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', rollup='average')${var.jvm_memory_old_usage_aggregation_function}${var.jvm_memory_old_usage_transformation_function} + signal = (A/B).scale(100).publish('signal') + detect(when(signal > ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting == 
null}None%{else}'${var.jvm_memory_old_usage_lasting}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least})).publish('MAJOR') + detect(when(signal > ${var.jvm_memory_old_usage_threshold_minor}, lasting=%{if var.jvm_memory_old_usage_lasting == null}None%{else}'${var.jvm_memory_old_usage_lasting}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least}) and (not when(signal > ${var.jvm_memory_old_usage_threshold_major}, lasting=%{if var.jvm_memory_old_usage_lasting == null}None%{else}'${var.jvm_memory_old_usage_lasting}'%{endif}, at_least=${var.jvm_memory_old_usage_at_least}))).publish('MINOR') EOF rule { @@ -441,10 +441,10 @@ resource "signalfx_detector" "jvm_gc_old_collection_latency" { program_text = <<-EOF A = data('elasticsearch.jvm.gc.old-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} - B = data('elasticsearch.jvm.gc.old-count', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}) and (not when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}))).publish('MINOR') + B = data('elasticsearch.jvm.gc.old-count', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', rollup='delta')${var.jvm_gc_old_collection_latency_aggregation_function}${var.jvm_gc_old_collection_latency_transformation_function} + signal = (A/B).publish('signal') + 
detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_old_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_old_collection_latency_lasting == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least}) and (not when(signal > ${var.jvm_gc_old_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_old_collection_latency_lasting == null}None%{else}'${var.jvm_gc_old_collection_latency_lasting}'%{endif}, at_least=${var.jvm_gc_old_collection_latency_at_least}))).publish('MINOR') EOF rule { @@ -483,10 +483,10 @@ resource "signalfx_detector" "jvm_gc_young_collection_latency" { program_text = <<-EOF A = data('elasticsearch.jvm.gc.time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} - B = data('elasticsearch.jvm.gc.count', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}) and (not when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}))).publish('MINOR') + B = data('elasticsearch.jvm.gc.count', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', 
rollup='delta')${var.jvm_gc_young_collection_latency_aggregation_function}${var.jvm_gc_young_collection_latency_transformation_function} + signal = (A/B).publish('signal') + detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least})).publish('MAJOR') + detect(when(signal > ${var.jvm_gc_young_collection_latency_threshold_minor}, lasting=%{if var.jvm_gc_young_collection_latency_lasting == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least}) and (not when(signal > ${var.jvm_gc_young_collection_latency_threshold_major}, lasting=%{if var.jvm_gc_young_collection_latency_lasting == null}None%{else}'${var.jvm_gc_young_collection_latency_lasting}'%{endif}, at_least=${var.jvm_gc_young_collection_latency_at_least}))).publish('MINOR') EOF rule { @@ -525,10 +525,10 @@ resource "signalfx_detector" "indexing_latency" { program_text = <<-EOF A = data('elasticsearch.indices.indexing.index-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} - B = data('elasticsearch.indices.indexing.index-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.indexing_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.indexing_latency_threshold_minor}) and (not when(signal > ${var.indexing_latency_threshold_major}))).publish('MINOR') + B = 
data('elasticsearch.indices.indexing.index-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', rollup='delta')${var.indexing_latency_aggregation_function}${var.indexing_latency_transformation_function} + signal = (A/B).publish('signal') + detect(when(signal > ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting == null}None%{else}'${var.indexing_latency_lasting}'%{endif}, at_least=${var.indexing_latency_at_least})).publish('MAJOR') + detect(when(signal > ${var.indexing_latency_threshold_minor}, lasting=%{if var.indexing_latency_lasting == null}None%{else}'${var.indexing_latency_lasting}'%{endif}, at_least=${var.indexing_latency_at_least}) and (not when(signal > ${var.indexing_latency_threshold_major}, lasting=%{if var.indexing_latency_lasting == null}None%{else}'${var.indexing_latency_lasting}'%{endif}, at_least=${var.indexing_latency_at_least}))).publish('MINOR') EOF rule { @@ -567,10 +567,10 @@ resource "signalfx_detector" "flush_latency" { program_text = <<-EOF A = data('elasticsearch.indices.flush.total-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} - B = data('elasticsearch.indices.flush.total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.flush_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.flush_latency_threshold_minor}) and (not when(signal > ${var.flush_latency_threshold_major}))).publish('MINOR') + B = data('elasticsearch.indices.flush.total', filter=filter('plugin', 
'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', rollup='delta')${var.flush_latency_aggregation_function}${var.flush_latency_transformation_function} + signal = (A/B).publish('signal') + detect(when(signal > ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting == null}None%{else}'${var.flush_latency_lasting}'%{endif}, at_least=${var.flush_latency_at_least})).publish('MAJOR') + detect(when(signal > ${var.flush_latency_threshold_minor}, lasting=%{if var.flush_latency_lasting == null}None%{else}'${var.flush_latency_lasting}'%{endif}, at_least=${var.flush_latency_at_least}) and (not when(signal > ${var.flush_latency_threshold_major}, lasting=%{if var.flush_latency_lasting == null}None%{else}'${var.flush_latency_lasting}'%{endif}, at_least=${var.flush_latency_at_least}))).publish('MINOR') EOF rule { @@ -609,10 +609,10 @@ resource "signalfx_detector" "search_latency" { program_text = <<-EOF A = data('elasticsearch.indices.search.query-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} - B = data('elasticsearch.indices.search.query-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.search_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.search_latency_threshold_minor}) and (not when(signal > ${var.search_latency_threshold_major}))).publish('MINOR') + B = data('elasticsearch.indices.search.query-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, 
extrapolation='last_value', rollup='delta')${var.search_latency_aggregation_function}${var.search_latency_transformation_function} + signal = (A/B).publish('signal') + detect(when(signal > ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting == null}None%{else}'${var.search_latency_lasting}'%{endif}, at_least=${var.search_latency_at_least})).publish('MAJOR') + detect(when(signal > ${var.search_latency_threshold_minor}, lasting=%{if var.search_latency_lasting == null}None%{else}'${var.search_latency_lasting}'%{endif}, at_least=${var.search_latency_at_least}) and (not when(signal > ${var.search_latency_threshold_major}, lasting=%{if var.search_latency_lasting == null}None%{else}'${var.search_latency_lasting}'%{endif}, at_least=${var.search_latency_at_least}))).publish('MINOR') EOF rule { @@ -651,10 +651,10 @@ resource "signalfx_detector" "fetch_latency" { program_text = <<-EOF A = data('elasticsearch.indices.search.fetch-time', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} - B = data('elasticsearch.indices.search.fetch-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} - signal = (A/B).fill(0).publish('signal') - detect(when(signal > ${var.fetch_latency_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.fetch_latency_threshold_minor}) and (not when(signal > ${var.fetch_latency_threshold_major}))).publish('MINOR') + B = data('elasticsearch.indices.search.fetch-total', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='last_value', 
rollup='delta')${var.fetch_latency_aggregation_function}${var.fetch_latency_transformation_function} + signal = (A/B).publish('signal') + detect(when(signal > ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting == null}None%{else}'${var.fetch_latency_lasting}'%{endif}, at_least=${var.fetch_latency_at_least})).publish('MAJOR') + detect(when(signal > ${var.fetch_latency_threshold_minor}, lasting=%{if var.fetch_latency_lasting == null}None%{else}'${var.fetch_latency_lasting}'%{endif}, at_least=${var.fetch_latency_at_least}) and (not when(signal > ${var.fetch_latency_threshold_major}, lasting=%{if var.fetch_latency_lasting == null}None%{else}'${var.fetch_latency_lasting}'%{endif}, at_least=${var.fetch_latency_at_least}))).publish('MINOR') EOF rule { @@ -693,8 +693,8 @@ resource "signalfx_detector" "field_data_evictions_change" { program_text = <<-EOF signal = data('elasticsearch.indices.fielddata.evictions', filter=filter('plugin', 'elasticsearch') and filter('node_name', '*') and ${module.filtering.signalflow}, extrapolation='zero', rollup='delta').rateofchange()${var.field_data_evictions_change_aggregation_function}${var.field_data_evictions_change_transformation_function}.publish('signal') - detect(when(signal > ${var.field_data_evictions_change_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.field_data_evictions_change_threshold_minor}) and (not when(signal > ${var.field_data_evictions_change_threshold_major}))).publish('MINOR') + detect(when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting == null}None%{else}'${var.field_data_evictions_change_lasting}'%{endif}, at_least=${var.field_data_evictions_change_at_least})).publish('MAJOR') + detect(when(signal > ${var.field_data_evictions_change_threshold_minor}, lasting=%{if var.field_data_evictions_change_lasting == null}None%{else}'${var.field_data_evictions_change_lasting}'%{endif}, 
at_least=${var.field_data_evictions_change_at_least}) and (not when(signal > ${var.field_data_evictions_change_threshold_major}, lasting=%{if var.field_data_evictions_change_lasting == null}None%{else}'${var.field_data_evictions_change_lasting}'%{endif}, at_least=${var.field_data_evictions_change_at_least}))).publish('MINOR') EOF rule { @@ -733,8 +733,8 @@ resource "signalfx_detector" "task_time_in_queue_change" { program_text = <<-EOF signal = data('elasticsearch.cluster.task-max-wait-time', filter=filter('plugin', 'elasticsearch') and ${module.filtering.signalflow}, rollup='average').rateofchange()${var.task_time_in_queue_change_aggregation_function}${var.task_time_in_queue_change_transformation_function}.publish('signal') - detect(when(signal > ${var.task_time_in_queue_change_threshold_major})).publish('MAJOR') - detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}))).publish('MINOR') + detect(when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting == null}None%{else}'${var.task_time_in_queue_change_lasting}'%{endif}, at_least=${var.task_time_in_queue_change_at_least})).publish('MAJOR') + detect(when(signal > ${var.task_time_in_queue_change_threshold_minor}, lasting=%{if var.task_time_in_queue_change_lasting == null}None%{else}'${var.task_time_in_queue_change_lasting}'%{endif}, at_least=${var.task_time_in_queue_change_at_least}) and (not when(signal > ${var.task_time_in_queue_change_threshold_major}, lasting=%{if var.task_time_in_queue_change_lasting == null}None%{else}'${var.task_time_in_queue_change_lasting}'%{endif}, at_least=${var.task_time_in_queue_change_at_least}))).publish('MINOR') EOF rule { @@ -763,4 +763,3 @@ EOF max_delay = var.task_time_in_queue_change_max_delay } - diff --git a/modules/smart-agent_elasticsearch/variables.tf b/modules/smart-agent_elasticsearch/variables.tf index 3cf6ee9bb..d609b88b5 
100644 --- a/modules/smart-agent_elasticsearch/variables.tf +++ b/modules/smart-agent_elasticsearch/variables.tf @@ -97,7 +97,19 @@ variable "cluster_status_aggregation_function" { variable "cluster_status_transformation_function" { description = "Transformation function for cluster_status_not_green detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" +} + +variable "cluster_status_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "cluster_status_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } # Cluster_initializing_shards detector @@ -153,7 +165,19 @@ variable "cluster_initializing_shards_aggregation_function" { variable "cluster_initializing_shards_transformation_function" { description = "Transformation function for cluster_initializing_shards detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" +} + +variable "cluster_initializing_shards_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "cluster_initializing_shards_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "cluster_initializing_shards_threshold_critical" { @@ -221,7 +245,19 @@ variable "cluster_relocating_shards_aggregation_function" { variable "cluster_relocating_shards_transformation_function" { description = "Transformation function for cluster_relocating_shards detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" +} + +variable "cluster_relocating_shards_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "cluster_relocating_shards_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "cluster_relocating_shards_threshold_critical" { @@ -289,7 +325,19 @@ variable "cluster_unassigned_shards_aggregation_function" { variable "cluster_unassigned_shards_transformation_function" { description = "Transformation function for cluster_unassigned_shards detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='10m')" + default = "" +} + +variable "cluster_unassigned_shards_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "cluster_unassigned_shards_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "cluster_unassigned_shards_threshold_critical" { @@ -357,7 +405,19 @@ variable "pending_tasks_aggregation_function" { variable "pending_tasks_transformation_function" { description = "Transformation function for pending_tasks detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" +} + +variable "pending_tasks_threshold_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "pending_tasks_threshold_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "pending_tasks_threshold_critical" { @@ -425,7 +485,19 @@ variable "jvm_heap_memory_usage_aggregation_function" { variable "jvm_heap_memory_usage_transformation_function" { description = "Transformation function for jvm_heap_memory_usage detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='5m')" + default = "" +} + +variable "jvm_heap_memory_usage_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "jvm_heap_memory_usage_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "jvm_heap_memory_usage_threshold_critical" { @@ -493,7 +565,19 @@ variable "cpu_usage_aggregation_function" { variable "cpu_usage_transformation_function" { description = "Transformation function for cpu_usage detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='30m')" + default = "" +} + +variable "cpu_usage_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "30m" +} + +variable "cpu_usage_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "cpu_usage_threshold_critical" { @@ -561,7 +645,7 @@ variable "file_descriptors_aggregation_function" { variable "file_descriptors_transformation_function" { description = "Transformation function for file_descriptors detector (i.e. \".mean(over='5m')\")" type = string - default = ".max(over='15m')" + default = "" } variable "file_descriptors_threshold_critical" { @@ -576,6 +660,18 @@ variable "file_descriptors_threshold_major" { default = 90 } +variable "file_descriptors_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "file_descriptors_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} + # Jvm_memory_young_usage detector variable "jvm_memory_young_usage_max_delay" { @@ -629,7 +725,19 @@ variable "jvm_memory_young_usage_aggregation_function" { variable "jvm_memory_young_usage_transformation_function" { description = "Transformation function for jvm_memory_young_usage detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='10m')" + default = "" +} + +variable "jvm_memory_young_usage_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "jvm_memory_young_usage_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "jvm_memory_young_usage_threshold_major" { @@ -697,7 +805,19 @@ variable "jvm_memory_old_usage_aggregation_function" { variable "jvm_memory_old_usage_transformation_function" { description = "Transformation function for jvm_memory_old_usage detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='10m')" + default = "" +} + +variable "jvm_memory_old_usage_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "jvm_memory_old_usage_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "jvm_memory_old_usage_threshold_major" { @@ -765,7 +885,19 @@ variable "jvm_gc_old_collection_latency_aggregation_function" { variable "jvm_gc_old_collection_latency_transformation_function" { description = "Transformation function for jvm_gc_old_collection_latency detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" +} + +variable "jvm_gc_old_collection_latency_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "jvm_gc_old_collection_latency_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "jvm_gc_old_collection_latency_threshold_major" { @@ -833,7 +965,19 @@ variable "jvm_gc_young_collection_latency_aggregation_function" { variable "jvm_gc_young_collection_latency_transformation_function" { description = "Transformation function for jvm_gc_young_collection_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" +} + +variable "jvm_gc_young_collection_latency_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "jvm_gc_young_collection_latency_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "jvm_gc_young_collection_latency_threshold_major" { @@ -901,7 +1045,19 @@ variable "indexing_latency_aggregation_function" { variable "indexing_latency_transformation_function" { description = "Transformation function for indexing_latency detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" +} + +variable "indexing_latency_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "indexing_latency_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "indexing_latency_threshold_major" { @@ -969,7 +1125,19 @@ variable "flush_latency_aggregation_function" { variable "flush_latency_transformation_function" { description = "Transformation function for flush_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" +} + +variable "flush_latency_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "flush_latency_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "flush_latency_threshold_major" { @@ -1037,7 +1205,19 @@ variable "search_latency_aggregation_function" { variable "search_latency_transformation_function" { description = "Transformation function for search_latency detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='30m')" + default = "" +} + +variable "search_latency_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "30m" +} + +variable "search_latency_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "search_latency_threshold_major" { @@ -1105,7 +1285,19 @@ variable "fetch_latency_aggregation_function" { variable "fetch_latency_transformation_function" { description = "Transformation function for fetch_latency detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" +} + +variable "fetch_latency_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "fetch_latency_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } variable "fetch_latency_threshold_major" { @@ -1173,7 +1365,19 @@ variable "field_data_evictions_change_aggregation_function" { variable "field_data_evictions_change_transformation_function" { description = "Transformation function for field_data_evictions_change detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" +} + +variable "field_data_evictions_change_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "field_data_evictions_change_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "field_data_evictions_change_threshold_major" { @@ -1241,7 +1445,19 @@ variable "task_time_in_queue_change_aggregation_function" { variable "task_time_in_queue_change_transformation_function" { description = "Transformation function for task_time_in_queue_change detector (i.e. \".mean(over='5m')\")" type = string - default = ".mean(over='15m')" + default = "" +} + +variable "task_time_in_queue_change_lasting" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "task_time_in_queue_change_at_least" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.5 } variable "task_time_in_queue_change_threshold_major" { @@ -1255,4 +1471,3 @@ variable "task_time_in_queue_change_threshold_minor" { type = number default = 100 } -