Skip to content

Commit 6e7f2b1

Browse files
committed
Add metrics on the internal-workers
1 parent 48a157e commit 6e7f2b1

File tree

4 files changed

+77
-9
lines changed

4 files changed

+77
-9
lines changed

service/domain_worker.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type request = {
1111
cancelled : unit Eio.Promise.t option;
1212
}
1313

14-
type reply = ((OpamPackage.t list, string) result * float, [`Msg of string]) result
14+
type reply = ((OpamPackage.t list, string) result * float, [`Msg of string | `Cancelled]) result
1515

1616
let env (vars : Worker.Vars.t) v =
1717
match v with
@@ -31,7 +31,7 @@ let env (vars : Worker.Vars.t) v =
3131

3232
let solve { packages; root_pkgs; pinned_pkgs; vars; cancelled } =
3333
match cancelled with
34-
| Some p when Promise.is_resolved p -> Error (`Msg "Cancelled")
34+
| Some p when Promise.is_resolved p -> Error `Cancelled
3535
| _ ->
3636
try
3737
let pins = root_pkgs @ pinned_pkgs |> OpamPackage.Name.Map.of_list in

service/domain_worker.mli

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ type request = {
1515
(** If resolved, the result is not needed. *)
1616
}
1717

18-
type reply = ((OpamPackage.t list, string) result * float, [`Msg of string]) result
18+
type reply = ((OpamPackage.t list, string) result * float, [`Msg of string | `Cancelled]) result
1919
(** [Ok (Ok selection)] if there is a solution.
2020
[Ok (Error msg)] if there is no solution.
21-
[Error msg] if the request was invalid. *)
21+
[Error (`Msg msg)] if the request was invalid.
22+
[Error `Cancelled] if the request was cancelled before it started. *)
2223

2324
val env : Solver_service_api.Worker.Vars.t -> string -> OpamVariable.variable_contents option
2425
(** [env vars name] is the value of [name] in [vars]. *)

service/pool.ml

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
open Eio.Std
22

3-
type ('request, 'reply) t = ('request * 'reply Promise.u) Eio.Stream.t
3+
type ('request, 'reply) t = {
4+
requests: ('request * 'reply Promise.u) Eio.Stream.t; running: int Atomic.t; n_workers: int
5+
}
46

57
let rec run_worker t handle =
6-
let request, set_reply = Eio.Stream.take t in
8+
let request, set_reply = Eio.Stream.take t.requests in
9+
Atomic.incr t.running;
710
handle request |> Promise.resolve set_reply;
11+
Atomic.decr t.running;
812
run_worker t handle
913

1014
let create ~sw ~domain_mgr ~n_workers handle =
11-
let t = Eio.Stream.create 0 in
15+
let t = { requests = Eio.Stream.create 0; running = Atomic.make 0; n_workers } in
1216
for _i = 1 to n_workers do
1317
Fiber.fork_daemon ~sw (fun () ->
1418
Eio.Domain_manager.run domain_mgr (fun () -> run_worker t handle)
@@ -18,5 +22,9 @@ let create ~sw ~domain_mgr ~n_workers handle =
1822

1923
let use t request =
2024
let reply, set_reply = Promise.create () in
21-
Eio.Stream.add t (request, set_reply);
25+
Eio.Stream.add t.requests (request, set_reply);
2226
Promise.await reply
27+
28+
let running_workers t = Atomic.get t.running
29+
30+
let n_workers t = t.n_workers

service/solver.ml

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,49 @@ type t = {
1212

1313
let ocaml = OpamPackage.Name.of_string "ocaml"
1414

15+
module Metrics = struct
16+
open Prometheus
17+
18+
let namespace = "ocluster"
19+
let subsystem = "worker"
20+
21+
let request_handling_total =
22+
let help = "Total number of handled solve requests" in
23+
Counter.v ~help ~namespace ~subsystem "requests_handled_total"
24+
25+
let request_handling =
26+
let help = "Number of handled requests by state" in
27+
Gauge.v_label ~label_name:"state" ~help ~namespace ~subsystem "solve_request_state"
28+
29+
let update_request_handling pool =
30+
let running = Pool.running_workers pool in
31+
let waiting = (Pool.n_workers pool) - running in
32+
Gauge.set (request_handling "running") (float_of_int running);
33+
Gauge.set (request_handling "waiting") (float_of_int waiting)
34+
35+
36+
let request_ok =
37+
let help = "Total number of success solve requests" in
38+
Counter.v ~help ~namespace ~subsystem "success_solve"
39+
40+
let request_fail =
41+
let help = "Total number of fail solve requests" in
42+
Counter.v ~help ~namespace ~subsystem "fail_solve"
43+
44+
let request_no_solution =
45+
let help = "Total number of no solution solve requests " in
46+
Counter.v ~help ~namespace ~subsystem "no_solution_solve"
47+
48+
let request_cancelled =
49+
let help = "Total number of cancel without running solve requests" in
50+
Counter.v ~help ~namespace ~subsystem "cancel_without_running_solve"
51+
52+
let request_cancelled_after =
53+
let help = "Total number of cancel when running solve requests" in
54+
Counter.v ~help ~namespace ~subsystem "cancel_when_running_solve"
55+
56+
end
57+
1558
(* If a local package has a literal constraint on OCaml's version and it doesn't match
1659
the platform, we just remove that package from the set to test, so other packages
1760
can still be tested. *)
@@ -47,6 +90,7 @@ let solve_for_platform ?cancelled t ~log ~opam_repository_commits ~packages ~roo
4790
) else (
4891
let slice = { Domain_worker.vars; root_pkgs; packages; pinned_pkgs; cancelled } in
4992
match Pool.use t.pool slice with
93+
| Error `Cancelled -> Error `Cancelled
5094
| Error (`Msg m) -> Error (`Msg m)
5195
| Ok (results, time) ->
5296
match results with
@@ -115,12 +159,15 @@ let solve ?cancelled t ~log request =
115159
in
116160
Log.info log "Solving for %a" Fmt.(list ~sep:comma string) root_pkgs;
117161
let serious_errors = ref [] in
162+
let cancels_without_running = ref 0 in
118163
let*! root_pkgs = parse_opams request.root_pkgs in
119164
let*! pinned_pkgs = parse_opams request.pinned_pkgs in
120165
let*! packages = Stores.packages t.stores opam_repository_commits in
121166
let results =
122167
platforms
123168
|> Fiber.List.map (fun (id, vars) ->
169+
Prometheus.Counter.inc_one Metrics.request_handling_total;
170+
Metrics.update_request_handling t.pool;
124171
let result =
125172
solve_for_platform t id
126173
?cancelled
@@ -132,30 +179,42 @@ let solve ?cancelled t ~log request =
132179
~pins
133180
~vars
134181
in
182+
Metrics.update_request_handling t.pool;
135183
(id, result)
136184
)
137185
|> List.filter_map (fun (id, result) ->
138186
Log.info log "= %s =" id;
139187
match result with
140188
| Ok result ->
189+
Prometheus.Counter.inc_one Metrics.request_ok;
141190
Log.info log "-> @[<hov>%a@]"
142191
Fmt.(list ~sep:sp string)
143192
result.Selection.packages;
144193
Log.info log "(valid since opam-repository commit(s): @[%a@])"
145194
Fmt.(list ~sep:semi (pair ~sep:comma string string))
146195
result.Selection.commits;
147196
Some result
197+
| Error `Cancelled ->
198+
Prometheus.Counter.inc_one Metrics.request_cancelled;
199+
incr cancels_without_running;
200+
Log.info log "%s" "Cancelled";
201+
None
148202
| Error (`No_solution msg) ->
203+
Prometheus.Counter.inc_one Metrics.request_no_solution;
149204
Log.info log "%s" msg;
150205
None
151206
| Error (`Msg msg) ->
207+
Prometheus.Counter.inc_one Metrics.request_fail;
152208
Log.info log "%s" msg;
153209
serious_errors := msg :: !serious_errors;
154210
None
155211
)
156212
in
157213
match cancelled with
158-
| Some p when Promise.is_resolved p -> Error `Cancelled
214+
| Some p when Promise.is_resolved p ->
215+
let cancels = (List.length platforms) - (!cancels_without_running) in
216+
Prometheus.Counter.inc Metrics.request_cancelled_after (float_of_int cancels);
217+
Error `Cancelled
159218
| _ ->
160219
match !serious_errors with
161220
| [] -> Ok results

0 commit comments

Comments
 (0)