Skip to content

Commit a0a3aa7

Browse files
committed
Checker: Collect and provide metrics in Prometheus format
Improves: fausecteam#30
1 parent 57b289c commit a0a3aa7

File tree

8 files changed

+419
-16
lines changed

8 files changed

+419
-16
lines changed

debian/control

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Depends:
3131
python3-django,
3232
python3-markdown,
3333
python3-pil,
34+
python3-prometheus-client,
3435
python3-psycopg2,
3536
python3-tz,
3637
python3-requests,

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
'Django == 1.11.*, >= 1.11.19',
2020
'Markdown',
2121
'Pillow',
22+
'prometheus_client',
2223
'pytz',
2324
'requests',
2425
],

src/ctf_gameserver/checker/master.py

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import datetime
33
import logging
44
import math
5+
import multiprocessing
56
import os
67
import signal
78
import time
@@ -17,7 +18,7 @@
1718
from ctf_gameserver.lib.exceptions import DBDataError
1819
import ctf_gameserver.lib.flag as flag_lib
1920

20-
from . import database
21+
from . import database, metrics
2122
from .supervisor import RunnerSupervisor
2223
from .supervisor import ACTION_FLAG, ACTION_LOAD, ACTION_STORE, ACTION_RESULT
2324

@@ -29,6 +30,8 @@ def main():
2930
help='(Old-style) Python formatstring for building the IP to connect to')
3031
arg_parser.add_argument('--flagsecret', type=str, required=True,
3132
help='Base64 string used as secret in flag generation')
33+
arg_parser.add_argument('--metrics-listen', type=str, help='Expose Prometheus metrics via HTTP '
34+
'("<host>:<port>")')
3235

3336
group = arg_parser.add_argument_group('statedb', 'Checker state database')
3437
group.add_argument('--statedbhost', type=str, help='Hostname of the database. If unspecified, the '
@@ -98,6 +101,33 @@ def main():
98101
return os.EX_USAGE
99102
logging_params['gelf'] = {'host': gelf_host, 'port': gelf_port}
100103

104+
# Configure metrics
105+
if args.metrics_listen is not None:
106+
metrics_listen = urllib.parse.urlsplit('//' + args.metrics_listen)
107+
metrics_host = metrics_listen.hostname
108+
metrics_port = metrics_listen.port
109+
if metrics_host is None or metrics_port is None:
110+
logging.error('Metrics listen address needs to be specified as "<host>:<port>"')
111+
return os.EX_USAGE
112+
113+
metrics_queue = multiprocessing.Queue()
114+
metrics_recv, metrics_send = multiprocessing.Pipe()
115+
metrics_collector_process = multiprocessing.Process(
116+
target=metrics.run_collector,
117+
args=(args.service, metrics.checker_metrics_factory, metrics_queue, metrics_send)
118+
)
119+
metrics_collector_process.start()
120+
metrics_server_process = multiprocessing.Process(
121+
target=metrics.run_http_server,
122+
args=(metrics_host, metrics_port, metrics_queue, metrics_recv)
123+
)
124+
metrics_server_process.start()
125+
126+
metrics.set(metrics_queue, 'interval_length_seconds', args.interval)
127+
metrics.set(metrics_queue, 'start_timestamp', time.time())
128+
else:
129+
metrics_queue = metrics.DummyQueue()
130+
101131
flag_secret = base64.b64decode(args.flagsecret)
102132

103133
# Connect to databases
@@ -159,7 +189,7 @@ def main():
159189
try:
160190
master_loop = MasterLoop(game_db_conn, state_db_conn, args.service, args.checkerscript,
161191
args.sudouser, args.stddeviations, args.checkercount, args.interval,
162-
args.ippattern, flag_secret, logging_params)
192+
args.ippattern, flag_secret, logging_params, metrics_queue)
163193
break
164194
except DBDataError as e:
165195
logging.warning('Waiting for valid database state: %s', e)
@@ -177,13 +207,18 @@ def sigterm_handler(_, __):
177207
if master_loop.shutting_down and master_loop.get_running_script_count() == 0:
178208
break
179209

210+
metrics_server_process.terminate()
211+
metrics_collector_process.terminate()
212+
metrics_server_process.join()
213+
metrics_collector_process.join()
214+
180215
return os.EX_OK
181216

182217

183218
class MasterLoop:
184219

185220
def __init__(self, game_db_conn, state_db_conn, service_slug, checker_script, sudo_user, std_dev_count,
186-
checker_count, interval, ip_pattern, flag_secret, logging_params):
221+
checker_count, interval, ip_pattern, flag_secret, logging_params, metrics_queue):
187222
self.game_db_conn = game_db_conn
188223
self.state_db_conn = state_db_conn
189224
self.checker_script = checker_script
@@ -194,12 +229,13 @@ def __init__(self, game_db_conn, state_db_conn, service_slug, checker_script, su
194229
self.ip_pattern = ip_pattern
195230
self.flag_secret = flag_secret
196231
self.logging_params = logging_params
232+
self.metrics_queue = metrics_queue
197233

198234
self.refresh_control_info()
199235
self.service = database.get_service_attributes(self.game_db_conn, service_slug)
200236
self.service['slug'] = service_slug
201237

202-
self.supervisor = RunnerSupervisor()
238+
self.supervisor = RunnerSupervisor(metrics_queue)
203239
self.known_tick = -1
204240
# Trigger launch of tasks in first step()
205241
self.last_launch = get_monotonic_time() - self.interval
@@ -254,6 +290,10 @@ def step(self):
254290
if not self.shutting_down:
255291
# Launch new tasks and catch up missed intervals
256292
while get_monotonic_time() - self.last_launch >= self.interval:
293+
delay = get_monotonic_time() - self.last_launch - self.interval
294+
metrics.observe(self.metrics_queue, 'task_launch_delay_seconds', delay)
295+
metrics.set(self.metrics_queue, 'last_launch_timestamp', time.time())
296+
257297
self.last_launch += self.interval
258298
self.launch_tasks()
259299

@@ -304,6 +344,7 @@ def handle_result_request(self, task_info, param):
304344

305345
logging.info('Result from Checker Script for team %d (net number %d) in tick %d: %s',
306346
task_info['_team_id'], task_info['team'], task_info['tick'], check_result)
347+
metrics.inc(self.metrics_queue, 'completed_tasks', labels={'result': check_result.name})
307348
database.commit_result(self.game_db_conn, self.service['id'], task_info['team'], task_info['tick'],
308349
result)
309350

@@ -372,6 +413,8 @@ def update_launch_params(self, tick):
372413
self.tasks_per_launch = math.ceil(local_tasks / intervals_per_timeframe)
373414
logging.info('Planning to start %d tasks per interval with a maximum duration of %d seconds',
374415
self.tasks_per_launch, check_duration)
416+
metrics.set(self.metrics_queue, 'tasks_per_launch_count', self.tasks_per_launch)
417+
metrics.set(self.metrics_queue, 'max_task_duration_seconds', check_duration)
375418

376419
def get_running_script_count(self):
377420
return len(self.supervisor.processes)

src/ctf_gameserver/checker/metrics.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import logging
2+
import queue
3+
from wsgiref import simple_server
4+
5+
import prometheus_client
6+
7+
8+
def inc(metrics_queue, name, value=1, labels=None):
    """
    Enqueues an 'inc' update (increment by `value`) for metric `name` on the collector queue.
    """

    message = MetricsMessage(name, 'inc', value, labels)
    metrics_queue.put(message)
12+
13+
def dec(metrics_queue, name, value=1, labels=None):
    """
    Enqueues a 'dec' update (decrement by `value`) for metric `name` on the collector queue.
    """

    message = MetricsMessage(name, 'dec', value, labels)
    metrics_queue.put(message)
17+
18+
def set(metrics_queue, name, value, labels=None):    # pylint: disable=redefined-builtin
    """
    Enqueues a 'set' update (assign `value`) for metric `name` on the collector queue.
    """

    message = MetricsMessage(name, 'set', value, labels)
    metrics_queue.put(message)
22+
23+
def observe(metrics_queue, name, value, labels=None):
    """
    Enqueues an 'observe' update (record sample `value`) for metric `name` on the collector queue.
    """

    message = MetricsMessage(name, 'observe', value, labels)
    metrics_queue.put(message)
27+
28+
class MetricsMessage:
    """
    Message to put into run_collector()'s queue for recording metric changes.
    """

    def __init__(self, name, instruction, value, labels=None):
        # Name of the target metric (without any prefix)
        self.name = name
        # Method to invoke on the metric object, e.g. 'inc' or 'observe'
        self.instruction = instruction
        self.value = value
        # Substitute an empty dict so callers may omit labels entirely
        self.labels = {} if labels is None else labels
42+
43+
44+
class HTTPGenMessage:
    """
    Marker message for run_collector()'s queue: requests that a text representation of the current
    metrics (for HTTP export) be sent back through the collector's pipe.
    """
49+
50+
51+
def checker_metrics_factory(registry):
    """
    Creates all metrics used by the Checker Master and registers them with the given Prometheus
    registry.

    Args:
        registry: prometheus_client.CollectorRegistry to register the metrics with.

    Returns:
        Dict mapping metric name (without the common prefix) to the Metric object.
    """

    metrics = {}
    metric_prefix = 'ctf_checkermaster_'

    counters = [
        ('started_tasks', 'Number of started Checker Script instances', []),
        ('completed_tasks', 'Number of successfully completed checks', ['result']),
        ('terminated_tasks', 'Number of Checker Script instances forcibly terminated', [])
    ]
    for name, doc, labels in counters:
        # Every metric additionally carries a "service" label to distinguish checker instances
        metrics[name] = prometheus_client.Counter(metric_prefix+name, doc, labels+['service'],
                                                  registry=registry)

    gauges = [
        # Fixed typo: balanced "(Unix)" to match the 'last_launch_timestamp' description below
        ('start_timestamp', '(Unix) timestamp when the process was started', []),
        ('interval_length_seconds', 'Configured launch interval length', []),
        ('last_launch_timestamp', '(Unix) timestamp when tasks were launched the last time', []),
        ('tasks_per_launch_count', 'Number of checks to start in one launch interval', []),
        ('max_task_duration_seconds', 'Currently estimated maximum runtime of one check', [])
    ]
    for name, doc, labels in gauges:
        metrics[name] = prometheus_client.Gauge(metric_prefix+name, doc, labels+['service'],
                                                registry=registry)

    histograms = [
        ('task_launch_delay_seconds', 'Differences between supposed and actual task launch times', [],
         (0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 10, 30, 60, float('inf'))),
        ('script_duration_seconds', 'Observed runtimes of Checker Scripts', [],
         (1, 3, 5, 8, 10, 20, 30, 45, 60, 90, 120, 150, 180, 240, 300, float('inf')))
    ]
    for name, doc, labels, buckets in histograms:
        metrics[name] = prometheus_client.Histogram(metric_prefix+name, doc, labels+['service'],
                                                    buckets=buckets, registry=registry)

    return metrics
87+
88+
89+
def run_collector(service, metrics_factory, in_queue, pipe_to_server):
    """
    Manages Prometheus metrics. Receives changes to the metrics through a queue and emits their text
    representation (for HTTP export) over a pipe. Designed to be run as "target" in a
    multiprocessing.Process in conjunction with run_http_server().

    Args:
        service: Slug of this checker instance's service.
        metrics_factory: Callable returning a dict of the metrics to use mapping from name to Metric
                         object.
        in_queue: Queue over which MetricsMessages and HTTPGenMessages are received.
        pipe_to_server: Pipe to which text representations of the metrics are sent in response to
                        HTTPGenMessages.
    """

    registry = prometheus_client.CollectorRegistry()
    metrics = metrics_factory(registry)

    def handle_metrics_message(msg):
        # Apply a single MetricsMessage to its metric; log and drop malformed messages instead of
        # letting them kill the collector process
        try:
            metric = metrics[msg.name]
        except KeyError:
            # Fixed typo in log message: "Recevied" -> "Received"
            logging.error('Received message for unknown metric "%s", ignoring', msg.name)
            return

        # Apparently, there is no nicer way to access the label names
        if 'service' in metric._labelnames:    # pylint: disable=protected-access
            msg.labels['service'] = service
        if len(msg.labels) > 0:
            try:
                metric = metric.labels(**(msg.labels))
            except ValueError:
                logging.error('Invalid labels specified for metric "%s", ignoring', msg.name)
                return

        try:
            bound_method = getattr(metric, msg.instruction)
        except AttributeError:
            logging.error('Cannot use instruction "%s" on metric "%s", ignoring', msg.instruction, msg.name)
            return
        try:
            bound_method(msg.value)
        except:    # noqa, pylint: disable=bare-except
            logging.exception('Could not update metric "%s":', msg.name)

    def send_metrics_text():
        metrics_text = prometheus_client.generate_latest(registry)
        pipe_to_server.send(metrics_text)

    # Serve messages forever; the parent process terminates this process on shutdown
    while True:
        message = in_queue.get(True)
        if isinstance(message, MetricsMessage):
            handle_metrics_message(message)
        elif isinstance(message, HTTPGenMessage):
            send_metrics_text()
        else:
            logging.error('Received unknown message on collector queue')
145+
146+
147+
def run_http_server(host, port, queue_to_collector, pipe_from_collector):
    """
    Runs a server exposing Prometheus metrics via HTTP. The metrics are requested through a HTTPGenMessage
    and received over the pipe. Designed to be run as "target" in a multiprocessing.Process in conjunction
    with run_collector().

    Args:
        host: Host to run the HTTP server on.
        port: Port to run the HTTP server on.
        queue_to_collector: Queue to which HTTPGenMessages are sent.
        pipe_from_collector: Pipe from which text representations of the metrics are received.
    """

    def wsgi_app(_, start_response):
        # Ask the collector process for a fresh text rendering of its metrics
        queue_to_collector.put(HTTPGenMessage())
        body = pipe_from_collector.recv()

        start_response('200 OK', [('Content-Type', prometheus_client.CONTENT_TYPE_LATEST)])
        return [body]

    class SilentHandler(simple_server.WSGIRequestHandler):
        def log_message(self, _, *args):
            """
            Doesn't log anything.
            """

    server = simple_server.make_server(host, port, wsgi_app, handler_class=SilentHandler)
    server.serve_forever()
179+
180+
181+
class DummyQueue(queue.Queue):
    """
    No-op stand-in for the metrics queue: accepts and silently drops every item. Used when metrics
    collection is disabled.
    """

    def put(self, item, block=True, timeout=None):
        # Intentionally discard the item
        return

src/ctf_gameserver/checker/supervisor.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
from ctf_gameserver.lib.checkresult import CheckResult
1313

14+
from . import metrics
15+
1416

1517
ACTION_FLAG = 'FLAG'
1618
ACTION_LOAD = 'LOAD'
@@ -34,14 +36,17 @@ class RunnerSupervisor:
3436
Launches Checker Script Runners as individual processes and takes care of communicating with them.
3537
"""
3638

37-
def __init__(self):
39+
def __init__(self, metrics_queue):
40+
self.metrics_queue = metrics_queue
41+
3842
# Timeout if there are no requests when all Runners are done or blocking
3943
self.queue_timeout = 1
4044
self._reset()
4145

4246
def _reset(self):
4347
self.work_queue = multiprocessing.Queue()
4448
self.processes = {}
49+
self.start_times = {}
4550
self.next_identifier = 0
4651

4752
def start_runner(self, args, sudo_user, info, logging_params):
@@ -51,14 +56,17 @@ def start_runner(self, args, sudo_user, info, logging_params):
5156
logging_params, self.next_identifier,
5257
self.work_queue, receive))
5358
self.processes[self.next_identifier] = (proc, send, info)
59+
self.start_times[self.next_identifier] = time.monotonic()
5460

5561
proc.start()
5662
self.next_identifier += 1
63+
metrics.inc(self.metrics_queue, 'started_tasks')
5764

5865
def terminate_runner(self, runner_id):
5966
logging.info('Terminating Runner process, info: %s', self.processes[runner_id][2])
6067
self.processes[runner_id][0].terminate()
6168
# Afterwards, get_request() will join the child and remove it from `self.processes`
69+
metrics.inc(self.metrics_queue, 'terminated_tasks')
6270

6371
def terminate_runners(self):
6472
if len(self.processes) > 0:
@@ -82,9 +90,14 @@ def get_request(self):
8290

8391
# Join all terminated child processes
8492
if action == ACTION_RUNNER_EXIT:
93+
duration = time.monotonic() - self.start_times[runner_id]
94+
metrics.observe(self.metrics_queue, 'script_duration_seconds', duration)
95+
del self.start_times[runner_id]
96+
8597
proc = self.processes[runner_id][0]
8698
proc.join()
8799
del self.processes[runner_id]
100+
88101
if self.work_queue.empty():
89102
return None
90103
else:

0 commit comments

Comments
 (0)