|
1 | 1 | local t = require('luatest')
|
2 | 2 | local vtest = require('test.luatest_helpers.vtest')
|
3 | 3 | local vutil = require('vshard.util')
|
| 4 | +local vconsts = require('vshard.consts') |
4 | 5 |
|
5 | 6 | local g = t.group('router')
|
6 | 7 | local cfg_template = {
|
@@ -861,3 +862,245 @@ g.test_request_timeout = function(g)
|
861 | 862 | _G.sleep_num = nil
|
862 | 863 | end)
|
863 | 864 | end
|
| 865 | + |
-- Build and apply a router config in which zones/weights make replica_1_b
-- the prioritized replica (router zone 1 -> zone 2 weight 1, zone 3 weight 2).
local function prepare_affect_priority_rs(g)
    local template = table.deepcopy(cfg_template)
    local replicas = template.sharding[1].replicas
    replicas.replica_1_a.zone = 3
    replicas.replica_1_b.zone = 2
    template.zone = 1
    template.weights = {
        [1] = {[1] = 0, [2] = 1, [3] = 2},
    }
    -- So that ping timeout is always > replica.net_timeout.
    -- net_timeout starts with CALL_TIMEOUT_MIN and is multiplied by 2 if the
    -- number of failed requests is >= 2.
    template.failover_ping_timeout = vconsts.CALL_TIMEOUT_MIN * 4
    vtest.router_cfg(g.router, vtest.config_new(template))
end
| 885 | + |
-- Drop every replica's net_timeout back to CALL_TIMEOUT_MIN so that it
-- doesn't affect the test. Needed because the tests use the absolute minimum
-- failover_ping_timeout with FAILOVER_DOWN_SEQUENTIAL_FAIL = 3: without this
-- reset, 10 successful calls would be needed to restore CALL_TIMEOUT_MIN.
local function affect_priority_clear_net_timeout(g)
    g.router:exec(function()
        local static_router = ivshard.router.internal.static_router
        for _, rs in pairs(static_router.replicasets) do
            for _, replica in pairs(rs.replicas) do
                replica.net_timeout = ivconst.CALL_TIMEOUT_MIN
            end
        end
    end)
end
| 900 | + |
--
-- gh-483: failover ping temporarily lowers a replica's priority when the
-- replica cannot be reached several times in a row:
--
-- 1. replica_1_b is the prioritized one. replica_1_a is the second one.
-- 2. The router establishes connections to all instances, failover sets the
--    prioritized replica_1_b.
-- 3. The node breaks and stops responding.
-- 4. Failover retries the ping FAILOVER_DOWN_SEQUENTIAL_FAIL times and changes
--    the prioritized replica to a lower-priority one. Note that the connection
--    is recreated on every failed ping.
-- 5. Every FAILOVER_UP_TIMEOUT failover checks whether any replica with higher
--    priority can be reached and changes the prioritized replica if so.
--
g.test_failover_ping_affects_priority = function()
    prepare_affect_priority_rs(g)

    -- Find prioritized replica and disable failover for now.
    g.router:exec(function(rs_uuid, replica_uuid)
        local router = ivshard.router.internal.static_router
        local rs = router.replicasets[rs_uuid]
        local opts = {timeout = iwait_timeout}
        rs:wait_connected_all(opts)

        t.helpers.retrying(opts, function()
            router.failover_fiber:wakeup()
            t.assert_equals(rs.replica.uuid, replica_uuid,
                            'Prioritized replica have not been set yet')
        end)

        local errinj = ivshard.router.internal.errinj
        errinj.ERRINJ_FAILOVER_DELAY = true
        t.helpers.retrying(opts, function()
            router.failover_fiber:wakeup()
            t.assert_equals(errinj.ERRINJ_FAILOVER_DELAY, 'in',
                            'Failover have not been stopped yet')
        end)
    end, {g.replica_1_b:replicaset_uuid(), g.replica_1_b:instance_uuid()})

    -- Break 'info' request on replica so that it fails with TimedOut error:
    -- the injected sleep exceeds any net_timeout the router may use.
    g.replica_1_b:exec(function()
        rawset(_G, 'old_call', ivshard.storage._call)
        ivshard.storage._call = function(service_name, ...)
            if service_name == 'info' then
                ifiber.sleep(ivconst.CALL_TIMEOUT_MIN * 5)
            end
            return _G.old_call(service_name, ...)
        end
    end)

    affect_priority_clear_net_timeout(g)
    g.router:exec(function(rs_uuid, master_uuid)
        local router = ivshard.router.internal.static_router
        local rs = router.replicasets[rs_uuid]

        -- And we change the prioritized replica.
        ivshard.router.internal.errinj.ERRINJ_FAILOVER_DELAY = false
        t.helpers.retrying({timeout = iwait_timeout}, function()
            router.failover_fiber:wakeup()
            t.assert_equals(rs.replica.uuid, master_uuid)
        end)

        -- Check, that prioritized replica is not changed, as it's still broken.
        rawset(_G, 'old_up_timeout', ivconst.FAILOVER_UP_TIMEOUT)
        ivconst.FAILOVER_UP_TIMEOUT = 0.01
        -- Bug fix: `on_yield` must be a callback, not the result of an
        -- immediate wakeup() call. The old code invoked wakeup() once while
        -- building the options table and passed nil as `on_yield`, so the
        -- failover fiber was never woken during the wait.
        ivtest.service_wait_for_new_ok(router.failover_service,
            {on_yield = function() router.failover_fiber:wakeup() end})
        t.assert_equals(rs.replica.uuid, master_uuid)
    end, {g.replica_1_b:replicaset_uuid(), g.replica_1_a:instance_uuid()})

    -- Restore 'info' request.
    g.replica_1_b:exec(function()
        ivshard.storage._call = _G.old_call
        _G.old_call = nil
    end)

    -- As replica_1_b has higher priority, it should be restored automatically.
    g.router:exec(function(rs_uuid, replica_uuid)
        local router = ivshard.router.internal.static_router
        local rs = router.replicasets[rs_uuid]
        t.assert_equals(rs.priority_list[1].uuid, replica_uuid)
        t.helpers.retrying({timeout = iwait_timeout}, function()
            router.failover_fiber:wakeup()
            t.assert_equals(rs.replica.uuid, replica_uuid,
                            'Prioritized replica is not up yet')
        end)
    end, {g.replica_1_b:replicaset_uuid(), g.replica_1_b:instance_uuid()})

    vtest.router_cfg(g.router, global_cfg)
    g.router:exec(function()
        ivconst.FAILOVER_UP_TIMEOUT = _G.old_up_timeout
        _G.old_up_timeout = nil
    end)
end
| 995 | + |
--
-- gh-483: failed user calls also affect priority. If several sequential
-- requests fail, the same logic as in the previous test applies.
--
g.test_failed_calls_affect_priority = function()
    prepare_affect_priority_rs(g)
    -- User-call timeout. The broken 'info' handler below sleeps for
    -- CALL_TIMEOUT_MIN * 5, so calls with this timeout end as TimedOut.
    local timeout = vconsts.CALL_TIMEOUT_MIN * 4

    -- Find prioritized replica and disable failover for now.
    g.router:exec(function(rs_uuid, replica_uuid)
        local router = ivshard.router.internal.static_router
        local rs = router.replicasets[rs_uuid]
        local opts = {timeout = iwait_timeout}
        rs:wait_connected_all(opts)

        t.helpers.retrying(opts, function()
            router.failover_fiber:wakeup()
            t.assert_equals(rs.replica.uuid, replica_uuid,
                            'Prioritized replica have not been set yet')
        end)

        -- Park the failover fiber inside its errinj delay so it cannot
        -- react to failures until the test re-enables it.
        local errinj = ivshard.router.internal.errinj
        errinj.ERRINJ_FAILOVER_DELAY = true
        t.helpers.retrying(opts, function()
            router.failover_fiber:wakeup()
            t.assert_equals(errinj.ERRINJ_FAILOVER_DELAY, 'in',
                            'Failover have not been stopped yet')
        end)

        -- Discovery is disabled, as it may affect `net_sequential_fail`
        -- and leads to flakiness of the test.
        errinj.ERRINJ_LONG_DISCOVERY = true
        t.helpers.retrying(opts, function()
            router.discovery_fiber:wakeup()
            t.assert_equals(errinj.ERRINJ_LONG_DISCOVERY, 'waiting',
                            'Discovery have not been stopped yet')
        end)
    end, {g.replica_1_b:replicaset_uuid(), g.replica_1_b:instance_uuid()})

    -- Break 'info' request on replica so that it fails with TimedOut error.
    -- No other request can be broken, as only failover changes priority and
    -- as soon as it wakes up it succeeds with `_call` and sets
    -- `net_sequential_fail` to 0.
    g.replica_1_b:exec(function()
        rawset(_G, 'old_call', ivshard.storage._call)
        ivshard.storage._call = function(service_name, ...)
            if service_name == 'info' then
                ifiber.sleep(ivconst.CALL_TIMEOUT_MIN * 5)
            end
            return _G.old_call(service_name, ...)
        end
    end)

    affect_priority_clear_net_timeout(g)
    local bid = vtest.storage_first_bucket(g.replica_1_a)
    -- Issue exactly FAILOVER_DOWN_SEQUENTIAL_FAIL failing user calls and
    -- verify they are all accounted in `net_sequential_fail`.
    g.router:exec(function(bid, timeout, rs_uuid, replica_uuid)
        local router = ivshard.router.internal.static_router
        local replica = router.replicasets[rs_uuid].replica
        t.assert_equals(replica.uuid, replica_uuid)

        local fails = replica.net_sequential_fail
        for _ = 1, ivconst.FAILOVER_DOWN_SEQUENTIAL_FAIL do
            local ok, err = ivshard.router.callro(bid, 'vshard.storage._call',
                                                  {'info'}, {timeout = timeout})
            t.assert_not(ok)
            t.assert(iverror.is_timeout(err))
        end

        t.assert_equals(replica.net_sequential_fail,
                        fails + ivconst.FAILOVER_DOWN_SEQUENTIAL_FAIL)

        -- Priority is changed only by failover. So, the prioritized replica
        -- is still the failing one.
        t.assert_equals(router.replicasets[rs_uuid].replica.uuid, replica_uuid)
    end, {bid, timeout, g.replica_1_b:replicaset_uuid(),
          g.replica_1_b:instance_uuid()})

    -- Enable failover, which changes priority of the replica.
    g.router:exec(function(rs_uuid, master_uuid)
        local router = ivshard.router.internal.static_router
        ivshard.router.internal.errinj.ERRINJ_FAILOVER_DELAY = false
        t.helpers.retrying({timeout = iwait_timeout}, function()
            router.failover_fiber:wakeup()
            t.assert_equals(router.replicasets[rs_uuid].replica.uuid,
                            master_uuid, 'Master is not prioritized yet')
        end)
    end, {g.replica_1_b:replicaset_uuid(), g.replica_1_a:instance_uuid()})

    -- Restore 'info' request.
    g.replica_1_b:exec(function()
        ivshard.storage._call = _G.old_call
        _G.old_call = nil
    end)

    -- As replica_1_b has higher priority, it should be restored automatically.
    g.router:exec(function(rs_uuid, replica_uuid)
        -- Shrink FAILOVER_UP_TIMEOUT so the up-check happens quickly;
        -- restored below before the test ends.
        local old_up_timeout = ivconst.FAILOVER_UP_TIMEOUT
        ivconst.FAILOVER_UP_TIMEOUT = 1
        local router = ivshard.router.internal.static_router
        local rs = router.replicasets[rs_uuid]
        t.assert_equals(rs.priority_list[1].uuid, replica_uuid)
        t.helpers.retrying({timeout = iwait_timeout}, function()
            router.failover_fiber:wakeup()
            t.assert_equals(rs.replica.uuid, replica_uuid,
                            'Prioritized replica is not up yet')
        end)
        ivconst.FAILOVER_UP_TIMEOUT = old_up_timeout
    end, {g.replica_1_b:replicaset_uuid(), g.replica_1_b:instance_uuid()})

    vtest.router_cfg(g.router, global_cfg)
end
0 commit comments