Disable on disk failure
yngvar-antonsson committed Apr 22, 2024
1 parent a996de4 commit 8dca1df
Showing 7 changed files with 286 additions and 32 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.rst
@@ -12,6 +12,14 @@ and this project adheres to
Unreleased
-------------------------------------------------------------------------------

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Added
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Disk failure check. If something is wrong with the disk, the instance
  is disabled automatically and the corresponding issue is shown
  in the WebUI.

-------------------------------------------------------------------------------
[2.10.0] - 2024-04-10
-------------------------------------------------------------------------------
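A minimal sketch of how the new issue surfaces to an operator (not from this commit; it assumes cartridge.issues exports the list_on_cluster() function shown in the next file):

local issues = require('cartridge.issues')

for _, issue in ipairs(issues.list_on_cluster()) do
    if issue.topic == 'disk_failure' then
        -- e.g. "critical: Disk error on instance <uuid>. This issue stays until restart"
        print(('%s: %s'):format(issue.level, issue.message))
    end
end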
23 changes: 16 additions & 7 deletions cartridge/issues.lua
@@ -74,6 +74,10 @@
--
-- * critical: "All instances are unhealthy in replicaset ... ".
--
-- Disk failures:
--
-- * critical: "Disk error on instance ... ".
--
-- Custom issues (defined by user):
--
-- * Custom roles can announce more issues with their own level, topic
@@ -520,15 +524,14 @@ local function list_on_instance(opts)
if type(box.cfg) == 'table' and not fio.lstat(box.cfg.memtx_dir) then
table.insert(ret, {
level = 'critical',
topic = 'disk_error',
topic = 'disk_failure',
instance_uuid = instance_uuid,
replicaset_uuid = replicaset_uuid,
message = string.format(
'Disk error on instance %s',
'Disk error on instance %s. This issue stays until restart',
instance_uuid
),
})
vshard.storage.disable()
end

-- add custom issues from each role
@@ -552,6 +555,7 @@
return ret
end

local disk_failure_cache = {}
local function list_on_cluster()
local state, err = confapplier.get_state()
if state == 'Unconfigured' and lua_api_proxy.can_call() then
@@ -720,16 +724,21 @@ local function list_on_cluster()
{uri_list = uri_list, timeout = 1}
)

local disk_error_uuids = {}
local disk_failure_uuids = {}
for _, issues in pairs(issues_map) do
for _, issue in pairs(issues) do
table.insert(ret, issue)
if issue.topic == 'disk_error' then
table.insert(disk_error_uuids, issue.instance_uuid)
if issue.topic == 'disk_failure' then
table.insert(disk_failure_uuids, issue.instance_uuid)
disk_failure_cache[issue.instance_uuid] = issue
end
end
end
lua_api_topology.disable_servers(disk_error_uuids)
for _, issue in pairs(disk_failure_cache) do
table.insert(ret, issue)
end

lua_api_topology.disable_servers(disk_failure_uuids)

-- to use this counter in tarantool/metrics
rawset(_G, '__cartridge_issues_cnt', #ret)
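The disk_failure_cache above makes a reported failure sticky: once list_on_cluster() has seen the issue, it re-adds it on every later poll, which is why the message says the issue stays until restart. A simplified sketch of that pattern (not from this commit; unlike the hunk above, it also skips duplicates for instances that are still reporting):

local disk_failure_cache = {}

local function collect(fresh_issues)
    local ret, reported_now = {}, {}
    for _, issue in ipairs(fresh_issues) do
        table.insert(ret, issue)
        if issue.topic == 'disk_failure' then
            disk_failure_cache[issue.instance_uuid] = issue
            reported_now[issue.instance_uuid] = true
        end
    end
    -- Replay cached failures for instances that went silent,
    -- e.g. because they have already been disabled.
    for uuid, issue in pairs(disk_failure_cache) do
        if not reported_now[uuid] then
            table.insert(ret, issue)
        end
    end
    return ret
end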
88 changes: 63 additions & 25 deletions cartridge/lua-api/topology.lua
@@ -14,6 +14,9 @@ local lua_api_edit_topology = require('cartridge.lua-api.edit-topology')

local ProbeServerError = errors.new_class('ProbeServerError')
local RestartReplicationError = errors.new_class('RestartReplicationError')
local RefineUriError = errors.new_class('RefineUriError')
local DisableServerError = errors.new_class('DisableServerError')


--- Get alias, uri and uuid of current instance.
-- @function get_self
@@ -105,8 +108,65 @@ local function probe_server(uri)
return true
end

local function get_uri_list(uuids, only_enabled)
if only_enabled == nil then
only_enabled = true
end
local topology_cfg = confapplier.get_readonly('topology')

if topology_cfg == nil then
return nil, RefineUriError:new(
"Current instance isn't bootstrapped yet"
)
end

local uri_list = {}
local refined_uri_list = topology.refine_servers_uri(topology_cfg)
for _, uuid in ipairs(uuids) do
local srv = topology_cfg.servers[uuid]
if not srv then
return nil, RefineUriError:new(
'Server %s not in clusterwide config', uuid
)
elseif only_enabled and topology.disabled(uuid, srv) then
return nil, RefineUriError:new(
'Server %s is disabled', uuid
)
end

table.insert(uri_list, refined_uri_list[uuid])
end
return uri_list
end


local function __cartridge_set_vshard_disabled_state(state)
local vshard = rawget(_G, 'vshard')
if vshard == nil then
return
end
if state == true and vshard.storage.internal.is_enabled then
vshard.storage.disable()
elseif state == false and not vshard.storage.internal.is_enabled then
vshard.storage.enable()
end
end
rawset(_G, '__cartridge_set_vshard_disabled_state', __cartridge_set_vshard_disabled_state)

local function __set_servers_disabled_state(uuids, state)
checks('table', 'boolean')

local uri_list, err = get_uri_list(uuids, state)
if uri_list == nil then
return nil, err
end

pool.map_call(
'_G.__cartridge_set_vshard_disabled_state',
{state}, { uri_list = uri_list }
)


local patch = {servers = {}}

for _, uuid in pairs(uuids) do
@@ -158,31 +218,9 @@ end
-- @treturn[2] table Error description
local function restart_replication(uuids)
checks('table')
local topology_cfg = confapplier.get_readonly('topology')

if topology_cfg == nil then
return nil, RestartReplicationError:new(
"Current instance isn't bootstrapped yet"
)
end

-- Prepare a server group to be operated
local uri_list = {}
local refined_uri_list = topology.refine_servers_uri(topology_cfg)
for _, uuid in ipairs(uuids) do
local srv = topology_cfg.servers[uuid]
if not srv then
return nil, RestartReplicationError:new(
'Server %s not in clusterwide config', uuid
)
elseif topology.disabled(uuid, srv) then
return nil, RestartReplicationError:new(
'Server %s is disabled, not suitable' ..
' for restarting replication', uuid
)
end

table.insert(uri_list, refined_uri_list[uuid])
local uri_list, err = get_uri_list(uuids)
if uri_list == nil then
return nil, err
end

local retmap, errmap = pool.map_call(
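With __set_servers_disabled_state in place, disabling or enabling a server through the Lua API also toggles vshard.storage on the target instance. A possible console session (not from this commit; it assumes the module's disable_servers/enable_servers wrappers return servers, err like the other lua-api calls, and instance_uuid is a placeholder):

local lua_api_topology = require('cartridge.lua-api.topology')
local instance_uuid = 'aaaaaaaa-aaaa-4000-8000-000000000001' -- placeholder

-- Disabling also calls vshard.storage.disable() on the instance.
local servers, err = lua_api_topology.disable_servers({instance_uuid})
assert(servers ~= nil, tostring(err))

-- ... replace the failed disk, restart the instance ...

-- Enabling calls vshard.storage.enable() again.
servers, err = lua_api_topology.enable_servers({instance_uuid})
assert(servers ~= nil, tostring(err))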
31 changes: 31 additions & 0 deletions cartridge/webui/api-suggestions.lua
@@ -46,6 +46,15 @@ local disable_servers_suggestion = gql_types.object({
}
})

local enable_servers_suggestion = gql_types.object({
name = 'EnableServerSuggestion',
description =
'A suggestion to enable all disabled servers',
fields = {
uuid = gql_types.string.nonNull,
}
})

local restart_replication_suggestion = gql_types.object({
name = 'RestartReplicationSuggestion',
description =
@@ -176,6 +185,25 @@ local function disable_servers(_, _, info)
return ret
end

local function enable_servers(_, _, info)
local topology_cfg = confapplier.get_readonly('topology')
if topology_cfg == nil then
return nil
end

local ret = {}

for _, uuid, _ in fun.filter(topology.disabled, topology_cfg.servers) do
table.insert(ret, {uuid = uuid})
end

if next(ret) == nil then
return nil
end

return ret
end

local function restart_replication(_, _, info)
local topology_cfg = confapplier.get_readonly('topology')
if topology_cfg == nil then
@@ -219,6 +247,7 @@ local function get_suggestions(_, _, info)
refine_uri = refine_uri(nil, nil, info),
force_apply = force_apply(nil, nil, info),
disable_servers = disable_servers(nil, nil, info),
enable_servers = enable_servers(nil, nil, info),
restart_replication = restart_replication(nil, nil, info),
}
end
@@ -236,6 +265,8 @@ local function init(graphql)
force_apply = gql_types.list(force_apply_suggestion.nonNull),
disable_servers =
gql_types.list(disable_servers_suggestion.nonNull),
enable_servers =
gql_types.list(enable_servers_suggestion.nonNull),
restart_replication =
gql_types.list(restart_replication_suggestion.nonNull),
}
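The new EnableServerSuggestion shows up alongside the other suggestions the WebUI already polls. A sketch of such a query as a Lua string (not from this commit; the cluster/suggestions field names are assumed, enable_servers and uuid come from the resolver above):

-- Hypothetical GraphQL query a client could send to the cluster endpoint.
local suggestions_query = [[
    query {
        cluster {
            suggestions {
                enable_servers { uuid }
                disable_servers { uuid }
            }
        }
    }
]]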
16 changes: 16 additions & 0 deletions cartridge/webui/api-topology.lua
@@ -303,6 +303,10 @@ local function disable_servers(_, args)
return lua_api_topology.disable_servers(args.uuids)
end

local function enable_servers(_, args)
return lua_api_topology.enable_servers(args.uuids)
end

local function restart_replication(_, args)
return lua_api_topology.restart_replication(args.uuids)
end
@@ -433,6 +437,17 @@ local function init(graphql)
callback = module_name .. '.disable_servers',
})

graphql.add_mutation({
prefix = 'cluster',
name = 'enable_servers',
doc = 'Enable listed servers by uuid',
args = {
uuids = gql_types.list(gql_types.string.nonNull),
},
kind = gql_types.list('Server'),
callback = module_name .. '.enable_servers',
})

graphql.add_mutation({
prefix = 'cluster',
name = 'restart_replication',
@@ -500,6 +515,7 @@ return {
edit_replicaset = edit_replicaset, -- deprecated
expel_server = expel_server, -- deprecated
disable_servers = disable_servers,
enable_servers = enable_servers,
restart_replication = restart_replication,

edit_topology = edit_topology,
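The mutation registered above, written out as a client would send it (not from this commit; the argument and field names are taken from the add_mutation call, the requested Server fields are assumed):

-- Hypothetical GraphQL mutation matching cluster.enable_servers above.
local enable_servers_mutation = [[
    mutation($uuids: [String!]) {
        cluster {
            enable_servers(uuids: $uuids) { uuid uri }
        }
    }
]]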
6 changes: 6 additions & 0 deletions rst/cartridge_admin.rst
@@ -1509,6 +1509,12 @@ Cartridge displays cluster and instances issues in WebUI:

|nbsp|

* Disk failure:

  * **critical**: "Disk error on instance ...". When you see this issue, the
    affected instance has been disabled automatically (on instances running
    vshard, the vshard storage is disabled as well). You need to fix the disk
    issue manually, restart the instance, and then re-enable it.

* Custom issues (defined by user):

* Custom roles can announce more issues with their own level, topic
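A short recovery example to go with the paragraph above (not from this commit; it assumes the top-level cartridge module re-exports the topology helper as admin_enable_servers, as it does for the other lua-api functions):

-- After fixing the disk and restarting the failed instance, re-enable it.
local cartridge = require('cartridge')

local servers, err = cartridge.admin_enable_servers({'<uuid-of-the-fixed-instance>'})
if servers == nil then
    error(err)
end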