import logging
import queue
from wsgiref import simple_server

import prometheus_client


def inc(metrics_queue, name, value=1, labels=None):
    """Enqueue an 'inc' (increment) operation for the named metric."""
    metrics_queue.put(MetricsMessage(name, 'inc', value, labels))


def dec(metrics_queue, name, value=1, labels=None):
    """Enqueue a 'dec' (decrement) operation for the named metric."""
    metrics_queue.put(MetricsMessage(name, 'dec', value, labels))


def set(metrics_queue, name, value, labels=None):  # pylint: disable=redefined-builtin
    """Enqueue a 'set' operation for the named metric."""
    metrics_queue.put(MetricsMessage(name, 'set', value, labels))


def observe(metrics_queue, name, value, labels=None):
    """Enqueue an 'observe' operation for the named metric."""
    metrics_queue.put(MetricsMessage(name, 'observe', value, labels))


class MetricsMessage:
    """
    Message to put into run_collector()'s queue for recording metric changes.
    """

    def __init__(self, name, instruction, value, labels=None):
        self.name = name
        self.instruction = instruction
        self.value = value

        if labels is None:
            self.labels = {}
        else:
            self.labels = labels


class HTTPGenMessage:
    """
    Message to put into run_collector()'s queue to request a text representation of its metrics (for HTTP
    export) through its pipe.
    """


def checker_metrics_factory(registry):
    """
    Creates the checker master's Prometheus metrics in the given registry and returns them as a dict mapping
    name to Metric object.
    """

    metrics = {}
    metric_prefix = 'ctf_checkermaster_'

    counters = [
        ('started_tasks', 'Number of started Checker Script instances', []),
        ('completed_tasks', 'Number of successfully completed checks', ['result']),
        ('terminated_tasks', 'Number of Checker Script instances forcibly terminated', [])
    ]
    for name, doc, labels in counters:
        metrics[name] = prometheus_client.Counter(metric_prefix+name, doc, labels+['service'],
                                                  registry=registry)

    gauges = [
        ('start_timestamp', '(Unix) timestamp when the process was started', []),
        ('interval_length_seconds', 'Configured launch interval length', []),
        ('last_launch_timestamp', '(Unix) timestamp when tasks were launched the last time', []),
        ('tasks_per_launch_count', 'Number of checks to start in one launch interval', []),
        ('max_task_duration_seconds', 'Currently estimated maximum runtime of one check', [])
    ]
    for name, doc, labels in gauges:
        metrics[name] = prometheus_client.Gauge(metric_prefix+name, doc, labels+['service'],
                                                registry=registry)

    histograms = [
        ('task_launch_delay_seconds', 'Differences between supposed and actual task launch times', [],
         (0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10, 30, 60, float('inf'))),
        ('script_duration_seconds', 'Observed runtimes of Checker Scripts', [],
         (1, 3, 5, 8, 10, 20, 30, 45, 60, 90, 120, 150, 180, 240, 300, float('inf')))
    ]
    for name, doc, labels, buckets in histograms:
        metrics[name] = prometheus_client.Histogram(metric_prefix+name, doc, labels+['service'],
                                                    buckets=buckets, registry=registry)

    return metrics


def run_collector(service, metrics_factory, in_queue, pipe_to_server):
    """
    Manages Prometheus metrics. Receives changes to the metrics through a queue and emits their text
    representation (for HTTP export) over a pipe. Designed to be run as "target" in a multiprocessing.Process
    in conjunction with run_http_server().

    Args:
        service: Slug of this checker instance's service.
        metrics_factory: Callable returning a dict of the metrics to use, mapping from name to Metric object.
        in_queue: Queue over which MetricsMessages and HTTPGenMessages are received.
        pipe_to_server: Pipe to which text representations of the metrics are sent in response to
            HTTPGenMessages.
    """

    registry = prometheus_client.CollectorRegistry()
    metrics = metrics_factory(registry)

    def handle_metrics_message(msg):
        try:
            metric = metrics[msg.name]
        except KeyError:
            logging.error('Received message for unknown metric "%s", ignoring', msg.name)
            return

        # Apparently, there is no nicer way to access the label names
        if 'service' in metric._labelnames:  # pylint: disable=protected-access
            msg.labels['service'] = service
        if len(msg.labels) > 0:
            try:
                metric = metric.labels(**(msg.labels))
            except ValueError:
                logging.error('Invalid labels specified for metric "%s", ignoring', msg.name)
                return

        try:
            bound_method = getattr(metric, msg.instruction)
        except AttributeError:
            logging.error('Cannot use instruction "%s" on metric "%s", ignoring', msg.instruction, msg.name)
            return
        try:
            bound_method(msg.value)
        except:  # noqa, pylint: disable=bare-except
            logging.exception('Could not update metric "%s":', msg.name)

    def send_metrics_text():
        metrics_text = prometheus_client.generate_latest(registry)
        pipe_to_server.send(metrics_text)

    while True:
        message = in_queue.get(True)
        if isinstance(message, MetricsMessage):
            handle_metrics_message(message)
        elif isinstance(message, HTTPGenMessage):
            send_metrics_text()
        else:
            logging.error('Received unknown message on collector queue')


def run_http_server(host, port, queue_to_collector, pipe_from_collector):
    """
    Runs a server exposing Prometheus metrics via HTTP. The metrics are requested through an HTTPGenMessage
    and received over the pipe. Designed to be run as "target" in a multiprocessing.Process in conjunction
    with run_collector().

    Args:
        host: Host to run the HTTP server on.
        port: Port to run the HTTP server on.
        queue_to_collector: Queue to which HTTPGenMessages are sent.
        pipe_from_collector: Pipe from which text representations of the metrics are received.
    """

    def app(_, start_response):
        queue_to_collector.put(HTTPGenMessage())
        output = pipe_from_collector.recv()

        status = '200 OK'
        headers = [
            ('Content-Type', prometheus_client.CONTENT_TYPE_LATEST)
        ]
        start_response(status, headers)
        return [output]

    class SilentHandler(simple_server.WSGIRequestHandler):
        def log_message(self, _, *args):
            """
            Doesn't log anything.
            """

    http_server = simple_server.make_server(host, port, app, handler_class=SilentHandler)
    http_server.serve_forever()


class DummyQueue(queue.Queue):
    """
    Queue that discards all elements put into it.
    """

    def put(self, item, block=True, timeout=None):
        pass
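

# Usage sketch (not part of the original module): one way the pieces above could be wired together, as
# described in the docstrings of run_collector() and run_http_server(). The service slug 'example' and the
# listen address/port are illustrative assumptions.
if __name__ == '__main__':
    import multiprocessing

    metrics_queue = multiprocessing.Queue()
    collector_conn, server_conn = multiprocessing.Pipe()

    # The collector owns the registry and applies MetricsMessages; the HTTP server requests the current
    # text representation with an HTTPGenMessage and reads the result from its end of the pipe.
    collector = multiprocessing.Process(target=run_collector,
                                        args=('example', checker_metrics_factory, metrics_queue,
                                              collector_conn))
    server = multiprocessing.Process(target=run_http_server,
                                     args=('127.0.0.1', 9500, metrics_queue, server_conn))
    collector.start()
    server.start()

    # Any process holding the queue can record metric changes through the module-level helpers
    inc(metrics_queue, 'started_tasks')
    observe(metrics_queue, 'script_duration_seconds', 4.2)

    collector.join()
    server.join()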