Skip to content

Commit 4016c53

Browse files
committed
Add metrics about the solver-service behaviour
1 parent 3560af8 commit 4016c53

File tree

2 files changed

+62
-19
lines changed

2 files changed

+62
-19
lines changed

service/service.ml

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,45 @@ module Selection = Worker.Selection
66
module Store = Git_unix.Store
77
module Worker_process = Internal_worker.Worker_process
88

9+
(** Prometheus metrics describing how the solver-process pool is used.
    Every metric is registered under the same [namespace] and [subsystem]. *)
module Metrics = struct
  let namespace = "ocluster"
  let subsystem = "worker"

  (** Gauge: solver processes currently used for solving. *)
  let in_use_solver_process =
    Prometheus.Gauge.v ~namespace ~subsystem
      ~help:"Number of solver-process currently used for solving"
      "in_use_solver_process"

  (** Gauge: requests waiting in the solver-process pool's queue. *)
  let wait_queue_solver_process_pool =
    Prometheus.Gauge.v ~namespace ~subsystem
      ~help:"Number of request in the queue of the solver-process pool"
      "wait_queue_requests"

  (** Counter: requests sent to the solver-process pool. *)
  let started_requests =
    Prometheus.Counter.v ~namespace ~subsystem
      ~help:"Number of requests sent on solver-process pool"
      "started_requests_total"

  (** Counter: solver-process requests that succeeded. *)
  let success_requests =
    Prometheus.Counter.v ~namespace ~subsystem
      ~help:"Number of solver_process requests success"
      "success_requests_total"

  (** Counter: solver-process requests that failed. *)
  let failed_requests =
    Prometheus.Counter.v ~namespace ~subsystem
      ~help:"Number of solver_process requests failed"
      "failed_requests_total"

  (** Counter: requests canceled before they were started. *)
  let canceled_before_started =
    Prometheus.Counter.v ~namespace ~subsystem
      ~help:"Number of solver_process requests canceled before started"
      "canceled_before_started_total"

  (** Counter: requests canceled after they were started. *)
  let canceled_after_started =
    Prometheus.Counter.v ~namespace ~subsystem
      ~help:"Number of solver_process requests canceled after started"
      "canceled_after_started_total"
end
43+
44+
(** [update_wait_queue t] sets the
    [Metrics.wait_queue_solver_process_pool] gauge to the current
    wait-queue length of the pool [t]. *)
let update_wait_queue t =
  Lwt_pool.wait_queue_length t
  |> float_of_int
  |> Prometheus.Gauge.set Metrics.wait_queue_solver_process_pool
47+
948
(* A pool of at most 180 elements, each of which is just [()].
   NOTE(review): presumably used only to bound how many callers run
   concurrently — confirm against the use sites. *)
let oldest_commit = Lwt_pool.create 180 (fun _ -> Lwt.return_unit)
1049
(* we are using at most 360 pipes at the same time and that's enough to keep the current
1150
 * performance and to prevent jobs from failing because the file-descriptor limit is exceeded. *)
@@ -91,7 +130,10 @@ module Make (Opam_repo : Opam_repository_intf.S) = struct
91130

92131
(* Send [request] to [worker] and read the reply. *)
93132
let process ~switch ~log ~id request worker =
94-
if not (Lwt_switch.is_on switch) then Lwt.fail Lwt.Canceled
133+
Prometheus.Gauge.inc_one Metrics.in_use_solver_process;
134+
if not (Lwt_switch.is_on switch) then (
135+
Prometheus.Counter.inc_one Metrics.canceled_before_started;
136+
Lwt.fail Lwt.Canceled)
95137
else
96138
let request_str =
97139
Worker.Solve_request.to_yojson request |> Yojson.Safe.to_string
@@ -131,6 +173,7 @@ module Make (Opam_repo : Opam_repository_intf.S) = struct
131173
* workers's pool choosing the worker for another processing.*)
132174
if Lwt.state process = Lwt.Sleep then (
133175
Worker_process.release worker;
176+
Prometheus.Counter.inc_one Metrics.canceled_after_started;
134177
Lwt.cancel process;
135178
dispose worker)
136179
else Lwt.return_unit )
@@ -193,9 +236,25 @@ module Make (Opam_repo : Opam_repository_intf.S) = struct
193236
else compatible_root_pkgs
194237
in
195238
let slice = { request with platforms = [ p ]; root_pkgs } in
196-
Lwt_pool.use t (process ~switch ~log ~id slice) >>= function
197-
| Error _ as e -> Lwt.return (id, e)
239+
Lwt.catch
240+
(fun () ->
241+
Prometheus.Counter.inc_one Metrics.started_requests;
242+
update_wait_queue t;
243+
Lwt_pool.use t (process ~switch ~log ~id slice))
244+
(function
245+
| Lwt.Canceled ->
246+
Prometheus.Gauge.dec_one Metrics.in_use_solver_process;
247+
update_wait_queue t;
248+
Lwt.fail Lwt.Canceled
249+
| ex -> raise ex)
250+
>>= fun result ->
251+
Prometheus.Gauge.dec_one Metrics.in_use_solver_process;
252+
match result with
253+
| Error _ as e ->
254+
Prometheus.Counter.inc_one Metrics.failed_requests;
255+
Lwt.return (id, e)
198256
| Ok packages ->
257+
let _ = Prometheus.Counter.inc_one Metrics.success_requests in
199258
let repo_packages =
200259
packages
201260
|> List.filter_map (fun pkg ->

worker/worker.ml

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,9 @@ module Metrics = struct
2424
Summary.v_label ~label_name:"result" ~help ~namespace ~subsystem
2525
"job_time_seconds"
2626

27-
let docker_push_time =
28-
let help = "Time uploading to Docker Hub" in
29-
Summary.v ~help ~namespace ~subsystem "docker_push_time_seconds"
30-
31-
let docker_prune_time =
32-
let help = "Time spent pruning Docker cache" in
33-
Summary.v ~help ~namespace ~subsystem "docker_prune_time_seconds"
34-
3527
let running_jobs =
3628
let help = "Number of jobs currently running" in
3729
Gauge.v ~help ~namespace ~subsystem "running_jobs"
38-
39-
let healthcheck_time =
40-
let help = "Time to perform last healthcheck" in
41-
Gauge.v ~help ~namespace ~subsystem "healthcheck_time_seconds"
42-
43-
let unhealthy =
44-
let help = "Number of unhealthy workers" in
45-
Gauge.v ~help ~namespace ~subsystem "unhealthy"
4630
end
4731

4832
type build =

0 commit comments

Comments
 (0)