From a1b4d57d920a50bd5051191526383ca03ea2eac2 Mon Sep 17 00:00:00 2001 From: Igor Zolotarev Date: Wed, 26 Jul 2023 16:46:50 +0300 Subject: [PATCH 1/4] Add issue about using sync spaces with wrong failover --- CHANGELOG.rst | 2 ++ cartridge/failover.lua | 18 +++++++++++ cartridge/issues.lua | 33 +++++++++++++++++++-- test/integration/failover_eventual_test.lua | 28 +++++++++++++++++ 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9bdfab05b..7783c935e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -35,6 +35,8 @@ Added - New Failover API function ``set_options`` to change failover internal params. +- Issue about sync spaces usage with a wrong failover setup. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Changed ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/cartridge/failover.lua b/cartridge/failover.lua index f929c39d7..c960f8409 100644 --- a/cartridge/failover.lua +++ b/cartridge/failover.lua @@ -1025,6 +1025,14 @@ local function is_suppressed() return vars.failover_suppressed end +--- Check if failover synchro mode enabled. +-- @function is_synchro_mode_enabled +-- @local +-- @treturn boolean true / false +local function is_synchro_mode_enabled() + return vars.enable_sychro_mode +end + --- Check if current configuration implies consistent switchover. -- @function consistency_needed -- @local @@ -1033,6 +1041,14 @@ local function consistency_needed() return vars.consistency_needed end +--- Get failover mode. +-- @function mode +-- @local +-- @treturn string +local function mode() + return vars.mode +end + --- Get current stateful failover coordinator -- @function get_coordinator -- @treturn[1] table coordinator @@ -1182,6 +1198,8 @@ return { is_rw = is_rw, is_paused = is_paused, is_suppressed = is_suppressed, + is_synchro_mode_enabled = is_synchro_mode_enabled, + mode = mode, force_inconsistency = force_inconsistency, wait_consistency = wait_consistency, diff --git a/cartridge/issues.lua b/cartridge/issues.lua index 1aaac72ee..8bcb5203d 100644 --- a/cartridge/issues.lua +++ b/cartridge/issues.lua @@ -305,6 +305,36 @@ local function list_on_instance(opts) }) end + if box.ctl.promote ~= nil + and failover.is_leader() + and (failover.mode() == 'eventual' + or (failover.mode() == 'stateful' and not failover.is_synchro_mode_enabled())) then + local has_sync_spaces = false + local n = 0 + for _, tuple in box.space._space:pairs(512, {iterator = 'GE'}) do + n = n + 1 + if n % 500 == 0 then + fiber.yield() + end + if tuple.flags.is_sync then + has_sync_spaces = true + break + end + end + if has_sync_spaces then + table.insert(ret, { + level = 'warning', + topic = 'failover', + instance_uuid = instance_uuid, + replicaset_uuid = replicaset_uuid, + message = 'Having sync spaces may cause failover errors. ' .. + 'Consider to change failover type to stateful and enable synchro_mode or use ' .. + 'raft failover mode' + }) + end + end + + -- It should be a vclockkeeper, but it's not if failover.consistency_needed() and failover.get_active_leaders()[replicaset_uuid] == instance_uuid @@ -557,8 +587,7 @@ local function list_on_cluster() -- Check stateful failover issues - local failover_cfg = topology.get_failover_params(topology_cfg) - if failover_cfg.mode == 'stateful' then + if failover.mode() == 'stateful' then local coordinator, err = failover.get_coordinator() if err ~= nil then diff --git a/test/integration/failover_eventual_test.lua b/test/integration/failover_eventual_test.lua index 3b2869024..9abc17271 100644 --- a/test/integration/failover_eventual_test.lua +++ b/test/integration/failover_eventual_test.lua @@ -709,6 +709,34 @@ g.after_test('test_sigstop', function() cluster:wait_until_healthy() end) + +function g.test_sync_spaces_is_prohibited() + t.skip_if(not helpers.tarantool_version_ge('2.6.1')) + local master = cluster:server('storage-1') + master:exec(function() + box.schema.space.create('test', {if_not_exists = true, is_sync=true}) + end) + + t.assert_items_equals(helpers.list_cluster_issues(master), { + { + level = 'warning', + topic = 'failover', + message = 'Having sync spaces may cause failover errors. ' .. + 'Consider to change failover type to stateful and enable synchro_mode or use ' .. + 'raft failover mode', + instance_uuid = master.instance_uuid, + replicaset_uuid = master.replicaset_uuid, + }, + }) +end + +g.after_test('test_sync_spaces_is_prohibited', function() + cluster:server('storage-1'):exec(function() + box.space.test:drop() + end) +end) + + local function failover_pause() cluster.main_server:graphql({ query = [[ From 92b252eb94ee7df1c6fb5316d21be61dbd1857e6 Mon Sep 17 00:00:00 2001 From: Igor Zolotarev Date: Tue, 1 Aug 2023 13:19:17 +0300 Subject: [PATCH 2/4] Add triggers --- cartridge/confapplier.lua | 3 + cartridge/issues.lua | 40 ++++------- cartridge/sync-spaces.lua | 75 +++++++++++++++++++++ test/integration/failover_eventual_test.lua | 29 ++++---- 4 files changed, 109 insertions(+), 38 deletions(-) create mode 100644 cartridge/sync-spaces.lua diff --git a/cartridge/confapplier.lua b/cartridge/confapplier.lua index 201a10b3e..f3481eed7 100644 --- a/cartridge/confapplier.lua +++ b/cartridge/confapplier.lua @@ -29,6 +29,7 @@ local cluster_cookie = require('cartridge.cluster-cookie') local ClusterwideConfig = require('cartridge.clusterwide-config') local logging_whitelist = require('cartridge.logging_whitelist') local invalid_format = require('cartridge.invalid-format') +local sync_spaces = require('cartridge.sync-spaces') yaml.cfg({ encode_load_metatables = false, @@ -558,12 +559,14 @@ local function boot_instance(clusterwide_config) -- It recovers snapshot -- Or bootstraps replication invalid_format.start_check() + sync_spaces.start_check() local snap1 = hotreload.snap_fibers() box.cfg(box_opts) local snap2 = hotreload.snap_fibers() hotreload.whitelist_fibers(hotreload.diff(snap1, snap2)) invalid_format.end_check() + sync_spaces.end_check() require('membership.options').SUSPICIOUSNESS = true local username = cluster_cookie.username() diff --git a/cartridge/issues.lua b/cartridge/issues.lua index 8bcb5203d..d37047b31 100644 --- a/cartridge/issues.lua +++ b/cartridge/issues.lua @@ -96,6 +96,7 @@ local failover = require('cartridge.failover') local confapplier = require('cartridge.confapplier') local lua_api_proxy = require('cartridge.lua-api.proxy') local invalid_format = require('cartridge.invalid-format') +local sync_spaces = require('cartridge.sync-spaces') local ValidateConfigError = errors.new_class('ValidateConfigError') @@ -305,33 +306,20 @@ local function list_on_instance(opts) }) end - if box.ctl.promote ~= nil - and failover.is_leader() + local sync_spaces_list = sync_spaces.spaces_list_str() + if sync_spaces_list ~= '' + and (failover.is_leader() and (failover.mode() == 'eventual' - or (failover.mode() == 'stateful' and not failover.is_synchro_mode_enabled())) then - local has_sync_spaces = false - local n = 0 - for _, tuple in box.space._space:pairs(512, {iterator = 'GE'}) do - n = n + 1 - if n % 500 == 0 then - fiber.yield() - end - if tuple.flags.is_sync then - has_sync_spaces = true - break - end - end - if has_sync_spaces then - table.insert(ret, { - level = 'warning', - topic = 'failover', - instance_uuid = instance_uuid, - replicaset_uuid = replicaset_uuid, - message = 'Having sync spaces may cause failover errors. ' .. - 'Consider to change failover type to stateful and enable synchro_mode or use ' .. - 'raft failover mode' - }) - end + or (failover.mode() == 'stateful' and not failover.is_synchro_mode_enabled()))) then + table.insert(ret, { + level = 'warning', + topic = 'failover', + instance_uuid = instance_uuid, + replicaset_uuid = replicaset_uuid, + message = 'Having sync spaces may cause failover errors. ' .. + 'Consider to change failover type to stateful and enable synchro_mode or use ' .. + 'raft failover mode. Sync spaces: ' .. sync_spaces_list + }) end diff --git a/cartridge/sync-spaces.lua b/cartridge/sync-spaces.lua new file mode 100644 index 000000000..f57bb49b2 --- /dev/null +++ b/cartridge/sync-spaces.lua @@ -0,0 +1,75 @@ +local log = require('log') +local vars = require('cartridge.vars').new('cartridge.sync-spaces') + +vars:new('sync_spaces', {}) + +local function check_sync(old, new) + if new == nil then + vars.sync_spaces[old[3]] = nil + return + end + local name = new[3] + + if new[6].is_sync then + vars.sync_spaces[name] = true + else + vars.sync_spaces[name] = nil + end +end + +local function before_replace(old, new) + check_sync(old, new) + return new +end + +local function on_schema_init() + box.space._space:before_replace(before_replace) +end + +--- Check if spaces have invalid format. +-- +-- @function start_check +-- @treturn string String of spaces with invalid params, delimeted by comma +local function spaces_list_str() + local res = '' + for name, _ in pairs(vars.sync_spaces) do + res = res .. name .. ', ' + end + return res:sub(1, -3) +end + +--- Start check if spaces have invalid format in Tarantool 2.x.x. +-- In Tarantool 1.10 you can perform check by youself with function `run_check`. +-- +-- @function start_check +local function start_check() + if box.ctl.on_schema_init ~= nil then + box.ctl.on_schema_init(on_schema_init) + end +end + + +--- Remove set triggers and write message to log. +-- +-- @function end_check +local function end_check() + if box.ctl.on_schema_init ~= nil then + box.space._space:before_replace(nil, before_replace) + box.ctl.on_schema_init(nil, on_schema_init) + end + + if next(vars.sync_spaces) then + log.warn( + "Having sync spaces may cause failover errors. " .. + "Consider to change failover type to stateful and enable synchro_mode or use " .. + "raft failover mode. Sync spaces: " .. + spaces_list_str() + ) + end +end + +return { + start_check = start_check, + end_check = end_check, + spaces_list_str = spaces_list_str, +} diff --git a/test/integration/failover_eventual_test.lua b/test/integration/failover_eventual_test.lua index 9abc17271..0b096ca86 100644 --- a/test/integration/failover_eventual_test.lua +++ b/test/integration/failover_eventual_test.lua @@ -17,7 +17,7 @@ g.before_all = function() datadir = fio.tempdir(), use_vshard = true, server_command = helpers.entrypoint('srv_basic'), - cookie = helpers.random_cookie(), + cookie = 'secret', replicasets = { { alias = 'router', @@ -717,23 +717,28 @@ function g.test_sync_spaces_is_prohibited() box.schema.space.create('test', {if_not_exists = true, is_sync=true}) end) - t.assert_items_equals(helpers.list_cluster_issues(master), { - { - level = 'warning', - topic = 'failover', - message = 'Having sync spaces may cause failover errors. ' .. - 'Consider to change failover type to stateful and enable synchro_mode or use ' .. - 'raft failover mode', - instance_uuid = master.instance_uuid, - replicaset_uuid = master.replicaset_uuid, - }, - }) + master:restart() + + helpers.retrying({}, function() + t.assert_items_equals(helpers.list_cluster_issues(master), { + { + level = 'warning', + topic = 'failover', + message = 'Having sync spaces may cause failover errors. ' .. + 'Consider to change failover type to stateful and enable synchro_mode or use ' .. + 'raft failover mode. Sync spaces: test', + instance_uuid = master.instance_uuid, + replicaset_uuid = master.replicaset_uuid, + }, + }) + end) end g.after_test('test_sync_spaces_is_prohibited', function() cluster:server('storage-1'):exec(function() box.space.test:drop() end) + cluster:server('storage-1'):restart() end) From 3bec6ef460e0182cf76c3ec6d3e5fcd8c0ae3c5b Mon Sep 17 00:00:00 2001 From: Igor Zolotarev <63460867+yngvar-antonsson@users.noreply.github.com> Date: Wed, 2 Aug 2023 14:55:40 +0300 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Michael Filonenko --- cartridge/failover.lua | 2 +- test/integration/failover_eventual_test.lua | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cartridge/failover.lua b/cartridge/failover.lua index c960f8409..63b1beb65 100644 --- a/cartridge/failover.lua +++ b/cartridge/failover.lua @@ -1030,7 +1030,7 @@ end -- @local -- @treturn boolean true / false local function is_synchro_mode_enabled() - return vars.enable_sychro_mode + return vars.enable_synchro_mode end --- Check if current configuration implies consistent switchover. diff --git a/test/integration/failover_eventual_test.lua b/test/integration/failover_eventual_test.lua index 0b096ca86..83fbb2b29 100644 --- a/test/integration/failover_eventual_test.lua +++ b/test/integration/failover_eventual_test.lua @@ -17,7 +17,7 @@ g.before_all = function() datadir = fio.tempdir(), use_vshard = true, server_command = helpers.entrypoint('srv_basic'), - cookie = 'secret', + cookie = helpers.random_cookie(), replicasets = { { alias = 'router', From 342c9b56442745b41e675435888055f19f8be2ee Mon Sep 17 00:00:00 2001 From: Igor Zolotarev Date: Wed, 2 Aug 2023 15:03:12 +0300 Subject: [PATCH 4/4] Fix description; --- cartridge/sync-spaces.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cartridge/sync-spaces.lua b/cartridge/sync-spaces.lua index f57bb49b2..3ca27ab22 100644 --- a/cartridge/sync-spaces.lua +++ b/cartridge/sync-spaces.lua @@ -26,9 +26,9 @@ local function on_schema_init() box.space._space:before_replace(before_replace) end ---- Check if spaces have invalid format. +--- List sync spaces. -- --- @function start_check +-- @function spaces_list_str -- @treturn string String of spaces with invalid params, delimeted by comma local function spaces_list_str() local res = ''