Disable on disk failure
yngvar-antonsson committed Apr 22, 2024
1 parent a996de4 commit 8dca1df
Showing 7 changed files with 286 additions and 32 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.rst
@@ -12,6 +12,14 @@ and this project adheres to
Unreleased
-------------------------------------------------------------------------------

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Added
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Disk failure check. If something is wrong with the disk, the instance
  is disabled automatically and the corresponding issue is shown
  in the WebUI.

-------------------------------------------------------------------------------
[2.10.0] - 2024-04-10
-------------------------------------------------------------------------------
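A minimal sketch of how the new issue surfaces to an operator (not from this commit; it assumes cartridge.issues exports the list_on_cluster() function shown in the next file):

local issues = require('cartridge.issues')

for _, issue in ipairs(issues.list_on_cluster()) do
    if issue.topic == 'disk_failure' then
        -- e.g. "critical: Disk error on instance <uuid>. This issue stays until restart"
        print(('%s: %s'):format(issue.level, issue.message))
    end
end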
23 changes: 16 additions & 7 deletions cartridge/issues.lua
@@ -74,6 +74,10 @@
--
-- * critical: "All instances are unhealthy in replicaset ... ".
--
-- Disk failures:
--
-- * critical: "Disk error on instance ... ".
--
-- Custom issues (defined by user):
--
-- * Custom roles can announce more issues with their own level, topic
@@ -520,15 +524,14 @@ local function list_on_instance(opts)
if type(box.cfg) == 'table' and not fio.lstat(box.cfg.memtx_dir) then
table.insert(ret, {
level = 'critical',
topic = 'disk_error',
topic = 'disk_failure',
instance_uuid = instance_uuid,
replicaset_uuid = replicaset_uuid,
message = string.format(
'Disk error on instance %s',
'Disk error on instance %s. This issue stays until restart',
instance_uuid
),
})
vshard.storage.disable()
end

-- add custom issues from each role
@@ -552,6 +555,7 @@
return ret
end

local disk_failure_cache = {}
local function list_on_cluster()
local state, err = confapplier.get_state()
if state == 'Unconfigured' and lua_api_proxy.can_call() then
@@ -720,16 +724,21 @@ local function list_on_cluster()
{uri_list = uri_list, timeout = 1}
)

local disk_error_uuids = {}
local disk_failure_uuids = {}
for _, issues in pairs(issues_map) do
for _, issue in pairs(issues) do
table.insert(ret, issue)
if issue.topic == 'disk_error' then
table.insert(disk_error_uuids, issue.instance_uuid)
if issue.topic == 'disk_failure' then
table.insert(disk_failure_uuids, issue.instance_uuid)
disk_failure_cache[issue.instance_uuid] = issue
end
end
end
lua_api_topology.disable_servers(disk_error_uuids)
for _, issue in pairs(disk_failure_cache) do
table.insert(ret, issue)
end

lua_api_topology.disable_servers(disk_failure_uuids)

-- to use this counter in tarantool/metrics
rawset(_G, '__cartridge_issues_cnt', #ret)
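The disk_failure_cache above makes a reported failure sticky: once list_on_cluster() has seen the issue, it re-adds it on every later poll, which is why the message says the issue stays until restart. A simplified sketch of that pattern (not from this commit; unlike the hunk above, it also skips duplicates for instances that are still reporting):

local disk_failure_cache = {}

local function collect(fresh_issues)
    local ret, reported_now = {}, {}
    for _, issue in ipairs(fresh_issues) do
        table.insert(ret, issue)
        if issue.topic == 'disk_failure' then
            disk_failure_cache[issue.instance_uuid] = issue
            reported_now[issue.instance_uuid] = true
        end
    end
    -- Replay cached failures for instances that went silent,
    -- e.g. because they have already been disabled.
    for uuid, issue in pairs(disk_failure_cache) do
        if not reported_now[uuid] then
            table.insert(ret, issue)
        end
    end
    return ret
end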
88 changes: 63 additions & 25 deletions cartridge/lua-api/topology.lua
@@ -14,6 +14,9 @@ local lua_api_edit_topology = require('cartridge.lua-api.edit-topology')

local ProbeServerError = errors.new_class('ProbeServerError')
local RestartReplicationError = errors.new_class('RestartReplicationError')
local RefineUriError = errors.new_class('RefineUriError')
local DisableServerError = errors.new_class('DisableServerError')


--- Get alias, uri and uuid of current instance.
-- @function get_self
@@ -105,8 +108,65 @@ local function probe_server(uri)
return true
end

local function get_uri_list(uuids, only_enabled)
if only_enabled == nil then
only_enabled = true
end
local topology_cfg = confapplier.get_readonly('topology')

if topology_cfg == nil then
return nil, RefineUriError:new(
"Current instance isn't bootstrapped yet"
)
end

local uri_list = {}
local refined_uri_list = topology.refine_servers_uri(topology_cfg)
for _, uuid in ipairs(uuids) do
local srv = topology_cfg.servers[uuid]
if not srv then
return nil, RefineUriError:new(
'Server %s not in clusterwide config', uuid
)
elseif only_enabled and topology.disabled(uuid, srv) then
return nil, RefineUriError:new(
'Server %s is disabled', uuid
)
end

table.insert(uri_list, refined_uri_list[uuid])
end
return uri_list
end


local function __cartridge_set_vshard_disabled_state(state)
local vshard = rawget(_G, 'vshard')
if vshard == nil then
return
end
if state == true and vshard.storage.internal.is_enabled then
vshard.storage.disable()
elseif state == false and not vshard.storage.internal.is_enabled then
vshard.storage.enable()
end
end
rawset(_G, '__cartridge_set_vshard_disabled_state', __cartridge_set_vshard_disabled_state)

local function __set_servers_disabled_state(uuids, state)
checks('table', 'boolean')

local uri_list, err = get_uri_list(uuids, state)
if uri_list == nil then
return nil, err
end

pool.map_call(
'_G.__cartridge_set_vshard_disabled_state',
{state}, { uri_list = uri_list }
)


local patch = {servers = {}}

for _, uuid in pairs(uuids) do
@@ -158,31 +218,9 @@ end
-- @treturn[2] table Error description
local function restart_replication(uuids)
checks('table')
local topology_cfg = confapplier.get_readonly('topology')

if topology_cfg == nil then
return nil, RestartReplicationError:new(
"Current instance isn't bootstrapped yet"
)
end

-- Prepare a server group to be operated
local uri_list = {}
local refined_uri_list = topology.refine_servers_uri(topology_cfg)
for _, uuid in ipairs(uuids) do
local srv = topology_cfg.servers[uuid]
if not srv then
return nil, RestartReplicationError:new(
'Server %s not in clusterwide config', uuid
)
elseif topology.disabled(uuid, srv) then
return nil, RestartReplicationError:new(
'Server %s is disabled, not suitable' ..
' for restarting replication', uuid
)
end

table.insert(uri_list, refined_uri_list[uuid])
local uri_list, err = get_uri_list(uuids)
if uri_list == nil then
return nil, err
end

local retmap, errmap = pool.map_call(
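With __set_servers_disabled_state in place, disabling or enabling a server through the Lua API also toggles vshard.storage on the target instance. A possible console session (not from this commit; it assumes the module's disable_servers/enable_servers wrappers return servers, err like the other lua-api calls, and instance_uuid is a placeholder):

local lua_api_topology = require('cartridge.lua-api.topology')
local instance_uuid = 'aaaaaaaa-aaaa-4000-8000-000000000001' -- placeholder

-- Disabling also calls vshard.storage.disable() on the instance.
local servers, err = lua_api_topology.disable_servers({instance_uuid})
assert(servers ~= nil, tostring(err))

-- ... replace the failed disk, restart the instance ...

-- Enabling calls vshard.storage.enable() again.
servers, err = lua_api_topology.enable_servers({instance_uuid})
assert(servers ~= nil, tostring(err))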
31 changes: 31 additions & 0 deletions cartridge/webui/api-suggestions.lua
@@ -46,6 +46,15 @@ local disable_servers_suggestion = gql_types.object({
}
})

local enable_servers_suggestion = gql_types.object({
name = 'EnableServerSuggestion',
description =
'A suggestion to enable all disabled servers',
fields = {
uuid = gql_types.string.nonNull,
}
})

local restart_replication_suggestion = gql_types.object({
name = 'RestartReplicationSuggestion',
description =
@@ -176,6 +185,25 @@ local function disable_servers(_, _, info)
return ret
end

local function enable_servers(_, _, info)
local topology_cfg = confapplier.get_readonly('topology')
if topology_cfg == nil then
return nil
end

local ret = {}

for _, uuid, _ in fun.filter(topology.disabled, topology_cfg.servers) do
table.insert(ret, {uuid = uuid})
end

if next(ret) == nil then
return nil
end

return ret
end

local function restart_replication(_, _, info)
local topology_cfg = confapplier.get_readonly('topology')
if topology_cfg == nil then
@@ -219,6 +247,7 @@ local function get_suggestions(_, _, info)
refine_uri = refine_uri(nil, nil, info),
force_apply = force_apply(nil, nil, info),
disable_servers = disable_servers(nil, nil, info),
enable_servers = enable_servers(nil, nil, info),
restart_replication = restart_replication(nil, nil, info),
}
end
@@ -236,6 +265,8 @@ local function init(graphql)
force_apply = gql_types.list(force_apply_suggestion.nonNull),
disable_servers =
gql_types.list(disable_servers_suggestion.nonNull),
enable_servers =
gql_types.list(enable_servers_suggestion.nonNull),
restart_replication =
gql_types.list(restart_replication_suggestion.nonNull),
}
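The new EnableServerSuggestion shows up alongside the other suggestions the WebUI already polls. A sketch of such a query as a Lua string (not from this commit; the cluster/suggestions field names are assumed, enable_servers and uuid come from the resolver above):

-- Hypothetical GraphQL query a client could send to the cluster endpoint.
local suggestions_query = [[
    query {
        cluster {
            suggestions {
                enable_servers { uuid }
                disable_servers { uuid }
            }
        }
    }
]]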
16 changes: 16 additions & 0 deletions cartridge/webui/api-topology.lua
@@ -303,6 +303,10 @@ local function disable_servers(_, args)
return lua_api_topology.disable_servers(args.uuids)
end

local function enable_servers(_, args)
return lua_api_topology.enable_servers(args.uuids)
end

local function restart_replication(_, args)
return lua_api_topology.restart_replication(args.uuids)
end
@@ -433,6 +437,17 @@ local function init(graphql)
callback = module_name .. '.disable_servers',
})

graphql.add_mutation({
prefix = 'cluster',
name = 'enable_servers',
doc = 'Enable listed servers by uuid',
args = {
uuids = gql_types.list(gql_types.string.nonNull),
},
kind = gql_types.list('Server'),
callback = module_name .. '.enable_servers',
})

graphql.add_mutation({
prefix = 'cluster',
name = 'restart_replication',
@@ -500,6 +515,7 @@ return {
edit_replicaset = edit_replicaset, -- deprecated
expel_server = expel_server, -- deprecated
disable_servers = disable_servers,
enable_servers = enable_servers,
restart_replication = restart_replication,

edit_topology = edit_topology,
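The mutation registered above, written out as a client would send it (not from this commit; the argument and field names are taken from the add_mutation call, the requested Server fields are assumed):

-- Hypothetical GraphQL mutation matching cluster.enable_servers above.
local enable_servers_mutation = [[
    mutation($uuids: [String!]) {
        cluster {
            enable_servers(uuids: $uuids) { uuid uri }
        }
    }
]]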
6 changes: 6 additions & 0 deletions rst/cartridge_admin.rst
@@ -1509,6 +1509,12 @@ Cartridge displays cluster and instances issues in WebUI:

|nbsp|

* Disk failure:

  * **critical**: "Disk error on instance ...". When you see this issue, the
    affected instance has been disabled automatically (on instances running
    vshard, the vshard storage is disabled as well). You need to fix the disk
    issue manually, restart the instance, and then re-enable it.

* Custom issues (defined by user):

* Custom roles can announce more issues with their own level, topic
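A short recovery example to go with the paragraph above (not from this commit; it assumes the top-level cartridge module re-exports the topology helper as admin_enable_servers, as it does for the other lua-api functions):

-- After fixing the disk and restarting the failed instance, re-enable it.
local cartridge = require('cartridge')

local servers, err = cartridge.admin_enable_servers({'<uuid-of-the-fixed-instance>'})
if servers == nil then
    error(err)
end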