Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/19026.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Shorten DNS resolver timeout/retry sequence from 60s to 15s to ensure DNS failures are visible before federation HTTP request timeouts.
39 changes: 38 additions & 1 deletion synapse/http/federation/srv_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import attr

from twisted.internet import defer
from twisted.internet.error import ConnectError
from twisted.names import client, dns
from twisted.names.error import DNSNameError, DNSNotImplementedError, DomainError
Expand All @@ -36,6 +37,20 @@

SERVER_CACHE: Dict[bytes, List["Server"]] = {}

DNS_LOOKUP_TIMEOUTS = (
1, # Quick retry for packet loss/scenarios
3, # Still reasonable for slow responders
3, # ...
3, # Already catching 99.9% of successful queries at 10s
# Final attempt for extreme edge cases.
#
# TODO: In the future (after 2026-01-01), we could consider removing this extra
# time if we don't see complaints. For comparison, the Windows
# DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see
# https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts
5,
)


@attr.s(auto_attribs=True, slots=True, frozen=True)
class Server:
Expand Down Expand Up @@ -145,7 +160,25 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:

try:
answers, _, _ = await make_deferred_yieldable(
self._dns_client.lookupService(service_name)
self._dns_client.lookupService(
service_name,
# This is a sequence of ints that represent the "number of seconds
# after which to reissue the query. When the last timeout expires,
# the query is considered failed." The default value in Twisted is
# `timeout=(1, 3, 11, 45)` (60s total) which is an "arbitrary"
# exponential backoff sequence and is too long (see below).
#
# We want the total timeout to be below the overarching HTTP request
# timeout (60s for federation requests) that spurred on this lookup.
# This way, we can see the underlying DNS failure and move on
# instead of the user ending up with a generic HTTP request timeout.
#
# Since these DNS queries are done over UDP (unreliable transport),
# by it's nature, it's bound to occasionally fail (dropped packets,
# etc). We want a list that starts small and re-issues DNS queries
# multiple times until we get a response or timeout.
timeout=DNS_LOOKUP_TIMEOUTS,
)
)
except DNSNameError:
# TODO: cache this. We can get the SOA out of the exception, and use
Expand All @@ -165,6 +198,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
return list(cache_entry)
else:
raise e
except defer.TimeoutError as e:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have a test that stressed this part of the code. Especially since I'm unsure if defer.TimeoutError is actually the exception type raised here.

Do you think you would be up for that? Probably involves reactor.advance(15 + 1) to advance time past the timeout. Otherwise, I can take a stab at it.

raise defer.TimeoutError(
f"Timed out while trying to resolve DNS for SRV record for {service_name!r} (timeout={sum(DNS_LOOKUP_TIMEOUTS)}s)"
) from e

if (
len(answers) == 1
Expand Down
8 changes: 6 additions & 2 deletions tests/http/federation/test_srv_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ def do_lookup() -> Generator["Deferred[object]", object, List[Server]]:
test_d = do_lookup()
self.assertNoResult(test_d)

dns_client_mock.lookupService.assert_called_once_with(service_name)
dns_client_mock.lookupService.assert_called_once_with(
service_name, timeout=(1, 3, 3, 3, 5)
)

result_deferred.callback(([answer_srv], None, None))

Expand Down Expand Up @@ -98,7 +100,9 @@ def test_from_cache_expired_and_dns_fail(
servers: List[Server]
servers = yield defer.ensureDeferred(resolver.resolve_service(service_name)) # type: ignore[assignment]

dns_client_mock.lookupService.assert_called_once_with(service_name)
dns_client_mock.lookupService.assert_called_once_with(
service_name, timeout=(1, 3, 3, 3, 5)
)

self.assertEqual(len(servers), 1)
self.assertEqual(servers, cache[service_name])
Expand Down