- 
                Notifications
    You must be signed in to change notification settings 
- Fork 404
srv_resolver: add 15s timeout to DNS lookups #19026
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Shorten DNS resolver timeout/retry sequence from 60s to 15s to ensure DNS failures are visible before federation HTTP request timeouts. | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -26,6 +26,7 @@ | |
|  | ||
| import attr | ||
|  | ||
| from twisted.internet import defer | ||
| from twisted.internet.error import ConnectError | ||
| from twisted.names import client, dns | ||
| from twisted.names.error import DNSNameError, DNSNotImplementedError, DomainError | ||
|  | @@ -36,6 +37,20 @@ | |
|  | ||
| SERVER_CACHE: Dict[bytes, List["Server"]] = {} | ||
|  | ||
| DNS_LOOKUP_TIMEOUTS = ( | ||
| 1, # Quick retry for packet loss/scenarios | ||
| 3, # Still reasonable for slow responders | ||
| 3, # ... | ||
| 3, # Already catching 99.9% of successful queries at 10s | ||
| # Final attempt for extreme edge cases. | ||
| # | ||
| # TODO: In the future (after 2026-01-01), we could consider removing this extra | ||
| # time if we don't see complaints. For comparison, the Windows | ||
| # DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see | ||
| # https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts | ||
| 5, | ||
| ) | ||
|  | ||
|  | ||
| @attr.s(auto_attribs=True, slots=True, frozen=True) | ||
| class Server: | ||
|  | @@ -145,7 +160,25 @@ async def resolve_service(self, service_name: bytes) -> List[Server]: | |
|  | ||
|         
                  amaanq marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| try: | ||
| answers, _, _ = await make_deferred_yieldable( | ||
| self._dns_client.lookupService(service_name) | ||
| self._dns_client.lookupService( | ||
| service_name, | ||
| # This is a sequence of ints that represent the "number of seconds | ||
| # after which to reissue the query. When the last timeout expires, | ||
| # the query is considered failed." The default value in Twisted is | ||
| # `timeout=(1, 3, 11, 45)` (60s total) which is an "arbitrary" | ||
| # exponential backoff sequence and is too long (see below). | ||
| # | ||
| # We want the total timeout to be below the overarching HTTP request | ||
| # timeout (60s for federation requests) that spurred on this lookup. | ||
| # This way, we can see the underlying DNS failure and move on | ||
| # instead of the user ending up with a generic HTTP request timeout. | ||
| # | ||
| # Since these DNS queries are done over UDP (unreliable transport), | ||
| # by it's nature, it's bound to occasionally fail (dropped packets, | ||
| # etc). We want a list that starts small and re-issues DNS queries | ||
| # multiple times until we get a response or timeout. | ||
| timeout=DNS_LOOKUP_TIMEOUTS, | ||
| ) | ||
| ) | ||
| except DNSNameError: | ||
|         
                  amaanq marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| # TODO: cache this. We can get the SOA out of the exception, and use | ||
|  | @@ -165,6 +198,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]: | |
| return list(cache_entry) | ||
| else: | ||
| raise e | ||
| except defer.TimeoutError as e: | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be nice to have a test that stressed this part of the code. Especially since I'm unsure if  Do you think you would be up for that? Probably involves  | ||
| raise defer.TimeoutError( | ||
| f"Timed out while trying to resolve DNS for SRV record for {service_name!r} (timeout={sum(DNS_LOOKUP_TIMEOUTS)}s)" | ||
| ) from e | ||
|  | ||
| if ( | ||
| len(answers) == 1 | ||
|  | ||
Uh oh!
There was an error while loading. Please reload this page.