Skip to content

Commit

Permalink
Implement optional interval and minimumFailures options in if*up.
Browse files Browse the repository at this point in the history
  • Loading branch information
miodvallat committed Dec 26, 2024
1 parent f244055 commit c588357
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 14 deletions.
4 changes: 4 additions & 0 deletions docs/lua-records/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ Record creation functions
- ``backupSelector``: used to pick the address(es) from all addresses if all addresses are down. Choices include 'pickclosest', 'random', 'hashed', 'all' (default 'random').
- ``source``: Source address to check from
- ``timeout``: Maximum time in seconds that you allow the check to take (default 2)
- ``interval``: Time interval between two checks, in seconds. Defaults to :ref:`setting-lua-health-checks-interval` if not specified.
- ``minimumFailures``: The number of unsuccessful checks in a row required to mark the address as down. Defaults to 1 if not specified, i.e. report as down on the first unsuccessful check.


.. function:: ifurlup(url, addresses[, options])
Expand All @@ -86,9 +88,11 @@ Record creation functions
- ``backupSelector``: used to pick the address from all addresses if all addresses are down. Choices include 'pickclosest', 'random', 'hashed', 'all' (default 'random').
- ``source``: Source address to check from
- ``timeout``: Maximum time in seconds that you allow the check to take (default 2)
- ``interval``: Time interval between two checks, in seconds. Defaults to :ref:`setting-lua-health-checks-interval` if not specified.
- ``stringmatch``: check ``url`` for this string, only declare 'up' if found
- ``useragent``: Set the HTTP "User-Agent" header in the requests. By default it is set to "PowerDNS Authoritative Server"
- ``byteslimit``: Limit the maximum download size to ``byteslimit`` bytes (default 0 meaning no limit).
- ``minimumFailures``: The number of unsuccessful checks in a row required to mark the address as down. Defaults to 1 if not specified, i.e. report as down on the first unsuccessful check.

An example of a list of address sets:

Expand Down
65 changes: 55 additions & 10 deletions pdns/lua-record.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
ponder netmask tree from file for huge number of netmasks
unify ifurlup/ifportup
add attribute for certificate check
add attribute for certificate check in genericIfUp
add list of current monitors
expire them too?
Expand Down Expand Up @@ -71,10 +71,14 @@ class IsUpOracle
CheckState(time_t _lastAccess): lastAccess(_lastAccess) {}
/* current status */
std::atomic<bool> status{false};
/* first check ? */
/* first check? */
std::atomic<bool> first{true};
/* number of successive checks returning failure */
std::atomic<unsigned int> failures{0};
/* last time the status was accessed */
std::atomic<time_t> lastAccess{0};
/* last time the status was modified */
std::atomic<time_t> lastStatusUpdate{0};
};

public:
Expand All @@ -88,7 +92,7 @@ class IsUpOracle
bool isUp(const CheckDesc& cd);

private:
void checkURL(const CheckDesc& cd, const bool status, const bool first = false)
void checkURL(const CheckDesc& cd, const bool status, const bool first) // NOLINT(readability-identifier-length)
{
setThreadName("pdns/lua-c-url");

Expand Down Expand Up @@ -139,7 +143,7 @@ class IsUpOracle
setDown(cd);
}
}
void checkTCP(const CheckDesc& cd, const bool status, const bool first = false) {
void checkTCP(const CheckDesc& cd, const bool status, const bool first) { // NOLINT(readability-identifier-length)
setThreadName("pdns/lua-c-tcp");
try {
int timeout = 2;
Expand Down Expand Up @@ -177,19 +181,46 @@ class IsUpOracle
std::chrono::system_clock::time_point checkStart = std::chrono::system_clock::now();
std::vector<std::future<void>> results;
std::vector<CheckDesc> toDelete;
time_t interval{g_luaHealthChecksInterval};
{
// make sure there's no insertion
auto statuses = d_statuses.read_lock();
for (auto& it: *statuses) {
auto& desc = it.first;
auto& state = it.second;
time_t checkInterval{0};
auto lastAccess = std::chrono::system_clock::from_time_t(state->lastAccess);

if (desc.opts.count("interval") != 0) {
checkInterval = std::atoi(desc.opts.at("interval").c_str());
if (checkInterval != 0) {
interval = std::gcd(interval, checkInterval);
}
}

if (not state->first) {
time_t nextCheckSecond = state->lastStatusUpdate;
if (checkInterval != 0) {
nextCheckSecond += checkInterval;
}
else {
nextCheckSecond += g_luaHealthChecksInterval;
}
if (checkStart < std::chrono::system_clock::from_time_t(nextCheckSecond)) {
continue; // too early
}
}

if (desc.url.empty()) { // TCP
results.push_back(std::async(std::launch::async, &IsUpOracle::checkTCP, this, desc, state->status.load(), state->first.load()));
} else { // URL
results.push_back(std::async(std::launch::async, &IsUpOracle::checkURL, this, desc, state->status.load(), state->first.load()));
}
if (std::chrono::system_clock::from_time_t(state->lastAccess) < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) {
// Give it a chance to run at least once.
// If minimumFailures * interval > lua-health-checks-expire-delay, then a down status will never get reported.
// This is unlikely to be a problem in practice due to the default value of the expire delay being one hour.
if (not state->first &&
lastAccess < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) {
toDelete.push_back(desc);
}
}
Expand All @@ -208,7 +239,7 @@ class IsUpOracle
// set thread name again, in case std::async surprised us by doing work in this thread
setThreadName("pdns/luaupcheck");

std::this_thread::sleep_until(checkStart + std::chrono::seconds(g_luaHealthChecksInterval));
std::this_thread::sleep_until(checkStart + std::chrono::seconds(interval));
}
}

Expand All @@ -222,9 +253,23 @@ class IsUpOracle
{
auto statuses = d_statuses.write_lock();
auto& state = (*statuses)[cd];
state->status = status;
if (state->first) {
state->first = false;
state->lastStatusUpdate = time(nullptr);
state->first = false;
if (status) {
state->failures = 0;
state->status = true;
} else {
unsigned int minimumFailures = 1;
if (cd.opts.count("minimumFailures") != 0) {
unsigned int value = std::atoi(cd.opts.at("minimumFailures").c_str());
if (value != 0) {
minimumFailures = std::max(minimumFailures, value);
}
}
// Since `status' was set to false at constructor time, we need to
// recompute its value unconditionally to expose "down, but not enough
// times yet" targets as up.
state->status = ++state->failures < minimumFailures;
}
}

Expand Down
23 changes: 23 additions & 0 deletions regression-tests.auth-py/authtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,29 @@ def assertAnyRRsetInAnswer(self, msg, rrsets):
raise AssertionError("RRset not found in answer\n%s" %
"\n".join(([ans.to_text() for ans in msg.answer])))

def assertNoneRRsetInAnswer(self, msg, rrsets):
    """Fails if any of the supplied rrsets is present in the answer section
    of msg (the comparison does not take the TTL into account)

    @param msg: the dns.message.Message to check
    @param rrsets: an array of dns.rrset.RRset objects"""

    if not isinstance(msg, dns.message.Message):
        raise TypeError("msg is not a dns.message.Message")

    unexpected = False
    for candidate in rrsets:
        if not isinstance(candidate, dns.rrset.RRset):
            raise TypeError("rrset is not a dns.rrset.RRset")
        # Deliberately no early exit: every entry of rrsets is type-checked,
        # even once a forbidden rrset has already been spotted.
        hits = [ans for ans in msg.answer
                if ans.match(candidate.name, candidate.rdclass, candidate.rdtype, 0, None)
                and ans == candidate]
        if hits:
            unexpected = True

    if unexpected:
        raise AssertionError("RRset incorrectly found in answer\n%s" %
                             "\n".join(([ans.to_text() for ans in msg.answer])))

def assertMatchingRRSIGInAnswer(self, msg, coveredRRset, keys=None):
"""Looks for coveredRRset in the answer section and if there is an RRSIG RRset
that covers that RRset. If keys is not None, this function will also try to
Expand Down
119 changes: 115 additions & 4 deletions regression-tests.auth-py/test_LuaRecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def log_message(self, format, *args):
def do_HEAD(self):
self._set_headers()

class TestLuaRecords(AuthTest):
class BaseLuaTest(AuthTest):
_config_template = """
geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb
edns-subnet-processing=yes
Expand Down Expand Up @@ -99,6 +99,16 @@ class TestLuaRecords(AuthTest):
"return ifurlup('http://www.lua.org:8080/', "
"{{EUEips, USAips}}, settings) ")
usa-unreachable IN LUA A ( ";settings={{stringmatch='Programming in Lua', minimumFailures=3}} "
"USAips={{'{prefix}.103', '192.168.42.105'}}"
"return ifurlup('http://www.lua.org:8080/', "
"USAips, settings) ")
usa-slowcheck IN LUA A ( ";settings={{stringmatch='Programming in Lua', interval=8}} "
"USAips={{'{prefix}.103', '192.168.42.105'}}"
"return ifurlup('http://www.lua.org:8080/', "
"USAips, settings) ")
mix.ifurlup IN LUA A ("ifurlup('http://www.other.org:8080/ping.json', "
"{{ '192.168.42.101', '{prefix}.101' }}, "
"{{ stringmatch='pong' }}) ")
Expand Down Expand Up @@ -192,7 +202,7 @@ def HTTPResponder(cls, port):
@classmethod
def setUpClass(cls):

super(TestLuaRecords, cls).setUpClass()
super(BaseLuaTest, cls).setUpClass()

cls._web_rrsets = [dns.rrset.from_text('web1.example.org.', 0, dns.rdataclass.IN, 'A',
'{prefix}.101'.format(prefix=cls._PREFIX)),
Expand All @@ -202,6 +212,8 @@ def setUpClass(cls):
'{prefix}.103'.format(prefix=cls._PREFIX))
]

class TestLuaRecords(BaseLuaTest):

def testPickRandom(self):
"""
Basic pickrandom() test with a set of A records
Expand Down Expand Up @@ -425,7 +437,7 @@ def testIfurlup(self):
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here
# the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
Expand Down Expand Up @@ -453,7 +465,7 @@ def testIfurlupMultiSet(self):
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here
# the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
Expand Down Expand Up @@ -1146,6 +1158,105 @@ class TestLuaRecordsNoWhiteSpace(TestLuaRecords):
def testWhitespace(self):
return TestLuaRecords.testWhitespace(self, False)

class TestLuaRecordsSlowTimeouts(BaseLuaTest):
    # This configuration is similar to BaseLuaTest, but the health check
    # interval is increased to 5 seconds.
    _config_template = """
geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb
edns-subnet-processing=yes
launch=bind geoip
any-to-tcp=no
enable-lua-records
lua-records-insert-whitespace=yes
lua-health-checks-interval=5
"""

    def _buildRRsets(self, qname):
        """
        Build the A rrsets the tests below compare answers against.

        @param qname: the record name, without the trailing dot
        @return: a (all_rrs, reachable_rrs, unreachable_rrs) tuple of lists
                 of dns.rrset.RRset; all_rrs holds the reachable rrsets
                 followed by the unreachable ones
        """
        reachable = [
            '{prefix}.103'.format(prefix=self._PREFIX)
        ]
        unreachable = ['192.168.42.105']
        all_rrs = []
        reachable_rrs = []
        unreachable_rrs = []
        for ip in reachable + unreachable:
            rr = dns.rrset.from_text(qname + '.', 0, dns.rdataclass.IN, 'A', ip)
            all_rrs.append(rr)
            if ip in reachable:
                reachable_rrs.append(rr)
            else:
                unreachable_rrs.append(rr)
        return all_rrs, reachable_rrs, unreachable_rrs

    def testIfurlupMinimumFailures(self):
        """
        Simple ifurlup() test with minimumFailures option set.
        """
        qname = 'usa-unreachable.example.org'
        all_rrs, reachable_rrs, unreachable_rrs = self._buildRRsets(qname)

        query = dns.message.make_query(qname, 'A')
        res = self.sendUDPQuery(query)
        self.assertRcodeEqual(res, dns.rcode.NOERROR)
        self.assertAnyRRsetInAnswer(res, all_rrs)

        # the health check interval (lua-health-checks-interval) is 5 seconds
        # in this configuration, so we make sure to wait slightly longer
        # here, but not too much in order to not reach the third loop
        # iteration of the checker thread.
        time.sleep(5 + 2)

        res = self.sendUDPQuery(query)
        self.assertRcodeEqual(res, dns.rcode.NOERROR)
        # due to minimumFailures set, there should be no error yet
        self.assertAnyRRsetInAnswer(res, all_rrs)

        # wait for another check. At this point the checker thread should have
        # reached the minimumFailures threshold and marked the unreachable IP
        # as such.
        time.sleep(5 + 2)
        res = self.sendUDPQuery(query)
        self.assertRcodeEqual(res, dns.rcode.NOERROR)
        self.assertAnyRRsetInAnswer(res, reachable_rrs)
        self.assertNoneRRsetInAnswer(res, unreachable_rrs)

    def testIfurlupInterval(self):
        """
        Simple ifurlup() test with interval option set.
        """
        qname = 'usa-slowcheck.example.org'
        all_rrs, reachable_rrs, unreachable_rrs = self._buildRRsets(qname)

        query = dns.message.make_query(qname, 'A')
        res = self.sendUDPQuery(query)
        self.assertRcodeEqual(res, dns.rcode.NOERROR)
        self.assertAnyRRsetInAnswer(res, all_rrs)

        # the health check interval (lua-health-checks-interval) is 5 seconds
        # in this configuration, but usa-slowcheck asks for interval=8, which
        # forces the checker thread to run every second (gcd of 5 and 8).
        time.sleep(6)

        res = self.sendUDPQuery(query)
        self.assertRcodeEqual(res, dns.rcode.NOERROR)
        # usa-slowcheck only sets interval=8 (no minimumFailures); presumably
        # the 8-second check is not due again yet, so no address should have
        # been dropped at this point - TODO confirm
        self.assertAnyRRsetInAnswer(res, all_rrs)

        # At this point the check should have fired.
        time.sleep(3)
        res = self.sendUDPQuery(query)
        self.assertRcodeEqual(res, dns.rcode.NOERROR)
        self.assertAnyRRsetInAnswer(res, reachable_rrs)
        self.assertNoneRRsetInAnswer(res, unreachable_rrs)

# Script entry point: run the test cases when this file is executed directly.
if __name__ == '__main__':
unittest.main()
# NOTE(review): unittest.main() exits the interpreter itself by default, so
# this exit(0) is presumably never reached - confirm before relying on it.
exit(0)

0 comments on commit c588357

Please sign in to comment.