Skip to content

Commit

Permalink
feat(prometheus): expose controlplane connectivity state as a gauge
Browse files Browse the repository at this point in the history
Add a new Prometheus gauge metric `control_plane_reachable`. Similar to
`datastore_reachable` gauge, 0 means the connection is not healthy; 1
means that the connection is healthy. We mark the connection as
unhealthy under the following circumstances:
* Failure while establihing a websocket connection
* Failure while sending basic information to controlplane
* Failure while sending ping to controlplane
* Failure while receiving a packet from the websocket connection

This is helpful for users running a signficant number of gateways to be
alerted about potential issues any gateway(s) may be facing while
talking to the controlplane.

Signed-off-by: Sanskar Jaiswal <[email protected]>
  • Loading branch information
aryan9600 committed Dec 19, 2024
1 parent f7f7a23 commit 4fbf17f
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 3 deletions.
25 changes: 22 additions & 3 deletions kong/clustering/data_plane.lua
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,20 @@ function _M.new(clustering)
end


local function set_control_plane_reachable(reachable)
local ok, err = ngx.shared.kong:safe_set("control_plane_reachable", reachable)
if not ok then
ngx_log(ngx_ERR, _log_prefix, "failed to set controlplane_reachable key in shm to " .. reachable .. " :", err)
end
end


function _M:init_worker(basic_info)
-- ROLE = "data_plane"

self.plugins_list = basic_info.plugins
self.filters = basic_info.filters
set_control_plane_reachable(false)

-- only run in process which worker_id() == 0
assert(ngx.timer.at(0, function(premature)
Expand All @@ -98,13 +107,17 @@ local function send_ping(c, log_suffix)

local _, err = c:send_ping(hash)
if err then
set_control_plane_reachable(false)
ngx_log(is_timeout(err) and ngx_NOTICE or ngx_WARN, _log_prefix,
"unable to send ping frame to control plane: ", err, log_suffix)

-- only log a ping if the hash changed
elseif hash ~= prev_hash then
prev_hash = hash
ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix)
else
set_control_plane_reachable(true)
if hash ~= prev_hash then
prev_hash = hash
ngx_log(ngx_INFO, _log_prefix, "sent ping frame to control plane with hash: ", hash, log_suffix)
end
end
end

Expand Down Expand Up @@ -156,6 +169,7 @@ function _M:communicate(premature)

local c, uri, err = clustering_utils.connect_cp(self, "/v1/outlet")
if not c then
set_control_plane_reachable(false)
ngx_log(ngx_WARN, _log_prefix, "connection to control plane ", uri, " broken: ", err,
" (retrying after ", reconnection_delay, " seconds)", log_suffix)

Expand Down Expand Up @@ -188,6 +202,7 @@ function _M:communicate(premature)
filters = self.filters,
labels = labels, }))
if err then
set_control_plane_reachable(false)
ngx_log(ngx_ERR, _log_prefix, "unable to send basic information to control plane: ", uri,
" err: ", err, " (retrying after ", reconnection_delay, " seconds)", log_suffix)

Expand All @@ -197,6 +212,7 @@ function _M:communicate(premature)
end))
return
end
set_control_plane_reachable(true)

local config_semaphore = semaphore.new(0)

Expand Down Expand Up @@ -302,16 +318,19 @@ function _M:communicate(premature)
local data, typ, err = c:recv_frame()
if err then
if not is_timeout(err) then
set_control_plane_reachable(false)
return nil, "error while receiving frame from control plane: " .. err
end

local waited = ngx_time() - last_seen
if waited > PING_WAIT then
set_control_plane_reachable(false)
return nil, "did not receive pong frame from control plane within " .. PING_WAIT .. " seconds"
end

goto continue
end
set_control_plane_reachable(true)

if typ == "close" then
ngx_log(ngx_DEBUG, _log_prefix, "received close frame from control plane", log_suffix)
Expand Down
13 changes: 13 additions & 0 deletions kong/plugins/prometheus/exporter.lua
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ local lower = string.lower
local ngx_timer_pending_count = ngx.timer.pending_count
local ngx_timer_running_count = ngx.timer.running_count
local get_all_upstreams = balancer.get_all_upstreams

if not balancer.get_all_upstreams then -- API changed since after Kong 2.5
get_all_upstreams = require("kong.runloop.balancer.upstreams").get_all_upstreams
end
Expand Down Expand Up @@ -65,6 +66,11 @@ local function init()
"0 is unreachable",
nil,
prometheus.LOCAL_STORAGE)
metrics.cp_reachable = prometheus:gauge("control_plane_reachable",
"Control plane reachable from gateway, " ..
"0 is unreachable",
nil,
prometheus.LOCAL_STORAGE)
metrics.node_info = prometheus:gauge("node_info",
"Kong Node metadata information",
{"node_id", "version"},
Expand Down Expand Up @@ -449,6 +455,13 @@ local function metric_data(write_fn)
kong.log.err("prometheus: failed to reach database while processing",
"/metrics endpoint: ", err)
end

local cp_reachable = ngx.shared.kong:get("control_plane_reachable")
if cp_reachable then
metrics.cp_reachable:set(1)
else
metrics.cp_reachable:set(0)
end
end

local phase = get_phase()
Expand Down

0 comments on commit 4fbf17f

Please sign in to comment.