Commit bf9e6a9

Serpentian authored and sergepetrenko committed
gc: fix wait_lsn function
GC goes to the replicas to check whether a SENT bucket can be deleted. This call via netbox can be faster than the replication itself, so some replicas may still see the bucket as SENDING. That is why GC syncs with the replicas before checking the bucket state.

However, the wait_lsn function, which is used by GC and by vshard.storage.sync(), tried to sync with every replica in the current replicaset without consulting the vshard config. If some CDC was connected as a non-anonymous replica, the GC process failed.

Let's make wait_lsn consider the vshard config: it must sync only with the instances that were explicitly passed to the config and that we know to be vshard storages. It is not always possible to tell from box.info.replication whether an instance is a vshard storage, e.g. when named identification is used but the names have not been set yet and UUIDs were not passed to the configuration. In that case we fall back to the old behavior and check the downstream of such a replica anyway, even though we are not sure the instance is in our configuration.

Closes #490

NO_DOC=bugfix
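For illustration, here is a minimal, hypothetical sketch of that filtering idea (the helper name and the `configured_uuids` set are assumptions, not vshard internals; the real implementation is in the diff below). For brevity it only waits for the master's own vclock component, while the actual patch compares full vclocks:

```lua
-- Hypothetical sketch, not the vshard code: wait until every replica
-- known to the vshard config has replicated our own LSN, ignoring
-- everything else in box.info.replication (e.g. a CDC connected as a
-- regular, non-anonymous replica).
local fiber = require('fiber')

local function wait_for_configured_replicas(configured_uuids, timeout)
    local self_id = box.info.id
    local target_lsn = box.info.vclock[self_id] or 0
    local deadline = fiber.clock() + timeout
    while fiber.clock() < deadline do
        local done = true
        for _, replica in pairs(box.info.replication) do
            -- Skip ourselves and replicas outside the configuration.
            if replica.id ~= self_id and configured_uuids[replica.uuid] then
                local down = replica.downstream
                if not down or down.status == 'stopped' or
                   not down.vclock or
                   (down.vclock[self_id] or 0) < target_lsn then
                    done = false
                    break
                end
            end
        end
        if done then
            return true
        end
        fiber.sleep(0.1)
    end
    return false
end
```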
1 parent c8f56cc commit bf9e6a9

File tree

2 files changed, +90 -0 lines changed

test/storage-luatest/storage_1_1_test.lua

Lines changed: 59 additions & 0 deletions
@@ -2,6 +2,7 @@ local t = require('luatest')
 local vconst = require('vshard.consts')
 local vtest = require('test.luatest_helpers.vtest')
 local vutil = require('vshard.util')
+local lserver = require('test.luatest_helpers.server')
 
 local group_config = {{engine = 'memtx'}, {engine = 'vinyl'}}
 
@@ -531,3 +532,61 @@ test_group.test_noactivity_timeout_for_explicit_master = function(g)
         _G.bucket_gc_wait()
     end, {g.replica_1_a:replicaset_uuid(), bid})
 end
+
+--
+-- gh-490: wait_lsn function didn't take into account, that there may
+-- be non anonymous replicas in box.info.replication, which are not
+-- accessible (e.g. CDC). Such replica broke GC process.
+--
+test_group.test_gc_with_non_anon_replica_in_cluster = function(g)
+    local box_cfg = {}
+    box_cfg.replication = {g.replica_1_a.net_box_uri}
+    local server = lserver:new({box_cfg = box_cfg, alias = 'cdc'})
+    server:start()
+    local id = server:instance_id()
+    server:drop()
+
+    vtest.cluster_exec_each_master(g, function(engine)
+        local format = {
+            {'id', 'unsigned'},
+            {'bid', 'unsigned'},
+        }
+        local s = box.schema.create_space('test', {
+            engine = engine,
+            format = format,
+        })
+        s:create_index('pk')
+        s:create_index('bucket_id', {unique = false, parts = {2}})
+    end, {g.params.engine})
+
+    local bid = g.replica_1_a:exec(function(dst)
+        local opts = {timeout = iwait_timeout}
+        local bid = _G.get_first_bucket()
+        -- Bump vclock of the master.
+        box.space.test:insert({1, bid})
+
+        -- Send bucket so that sent/garbage remains.
+        local ok, err = ivshard.storage.bucket_send(bid, dst, opts)
+        ilt.assert_equals(err, nil, 'bucket_send no error')
+        ilt.assert(ok, 'bucket_send ok')
+
+        -- This failed before the patch.
+        _G.bucket_gc_wait()
+        ivshard.storage.sync()
+        return bid
+    end, {g.replica_2_a:replicaset_uuid()})
+
+    g.replica_2_a:exec(function(bid, dst)
+        local opts = {timeout = iwait_timeout}
+        local ok, err = ivshard.storage.bucket_send(bid, dst, opts)
+        ilt.assert_equals(err, nil, 'bucket_send no error')
+        ilt.assert(ok, 'bucket_send ok')
+    end, {bid, g.replica_1_a:replicaset_uuid()})
+
+    g.replica_1_a:exec(function(id)
+        box.space._cluster:delete(id)
+    end, {id})
+    vtest.cluster_exec_each_master(g, function()
+        box.space.test:drop()
+    end)
+end
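For context, the dropped 'cdc' server stays registered in _cluster, so it keeps appearing in box.info.replication without a live downstream. A small illustrative snippet (not part of the test) that makes this visible on the master:

```lua
-- Illustrative only: list non-self replicas and their downstream status.
-- A registered-but-gone replica, like the dropped 'cdc' above, shows up
-- with no downstream or a stopped one, which is what used to make
-- wait_lsn hang before this patch.
for id, replica in pairs(box.info.replication) do
    if id ~= box.info.id then
        local down = replica.downstream
        print(string.format('replica %d (%s): downstream %s',
                            id, replica.uuid,
                            down and down.status or 'none'))
    end
end
```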

vshard/storage/init.lua

Lines changed: 31 additions & 0 deletions
@@ -1140,14 +1140,37 @@ end
 local function vclock_lesseq(vc1, vc2)
     local lesseq = true
     for i, lsn in ipairs(vc1) do
+        if i == 0 then
+            -- Skip local component.
+            goto continue
+        end
         lesseq = lesseq and lsn <= (vc2[i] or 0)
         if not lesseq then
             break
         end
+        ::continue::
     end
     return lesseq
 end
 
+local function is_replica_in_configuration(replica)
+    local is_named = M.this_replica.id == M.this_replica.name
+    local id = is_named and replica.name or replica.uuid
+    if id ~= nil then
+        -- In most cases name or uuid are properly set.
+        return M.this_replicaset.replicas[id] ~= nil
+    end
+
+    -- When named identification is used it's possible, that names
+    -- have not been set yet: we're working on top of schema < 3.0.0.
+    -- If user passed uuid to the configuration, then we can validate
+    -- such replica by checking UUIDs of replicaset.replicas, but we
+    -- won't do that, since if user didn't specify the uuid, then it's
+    -- anyway better to check the downstream rather than fail immediately,
+    -- in most cases it'll pass.
+    return true
+end
+
 local function wait_lsn(timeout, interval)
     local info = box.info
     local current_id = info.id
@@ -1163,6 +1186,14 @@ local function wait_lsn(timeout, interval)
         if replica.id == current_id then
             goto continue
         end
+
+        -- We must validate, that we're checking downstream of the replica,
+        -- which was actually configured as vshard storage and not some CDC.
+        -- If we're not sure, then we'll anyway check it.
+        if not is_replica_in_configuration(replica) then
+            goto continue
+        end
+
         local down = replica.downstream
         if not down or (down.status == 'stopped' or
                         not vclock_lesseq(vclock, down.vclock)) then
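To make the vclock comparison concrete, here is a small self-contained re-statement of its semantics with worked values. It intentionally uses `pairs` so the skipped component 0 is explicit, and assumes vclocks are plain Lua tables keyed by replica id, the same shape as box.info.vclock:

```lua
-- Not the vshard function: a re-statement of the comparison semantics.
-- vc1 <= vc2 iff every non-local component of vc1 is <= the matching
-- component of vc2, with a missing component counting as 0.
local function vclock_lesseq(vc1, vc2)
    for i, lsn in pairs(vc1) do
        if i ~= 0 and lsn > (vc2[i] or 0) then
            return false
        end
    end
    return true
end

assert(vclock_lesseq({[1] = 5, [2] = 3}, {[1] = 5, [2] = 7}))
assert(not vclock_lesseq({[1] = 6}, {[1] = 5, [2] = 7}))     -- 6 > 5
assert(vclock_lesseq({[0] = 100, [1] = 5}, {[1] = 5}))       -- local component ignored
```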
